{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998383185125303, "eval_steps": 25.0, "global_step": 1236, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "full_loss": 0.5913, "grad_norm": 11.4375, "learning_rate": 6.578947368421053e-07, "long_answer_loss": 0.5913, "loss": 0.5772, "short_answer_loss": NaN, "step": 1, "template_loss": 0.0 }, { "epoch": 0.0, "full_loss": 0.5407, "grad_norm": 11.6875, "learning_rate": 1.3157894736842106e-06, "long_answer_loss": 0.5407, "loss": 0.5984, "short_answer_loss": NaN, "step": 2, "template_loss": 0.0 }, { "epoch": 0.0, "full_loss": 0.5632, "grad_norm": 11.0, "learning_rate": 1.9736842105263157e-06, "long_answer_loss": 0.5632, "loss": 0.5768, "short_answer_loss": NaN, "step": 3, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.5517, "grad_norm": 10.5, "learning_rate": 2.631578947368421e-06, "long_answer_loss": 0.5517, "loss": 0.5593, "short_answer_loss": NaN, "step": 4, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.4862, "grad_norm": 9.0625, "learning_rate": 3.2894736842105265e-06, "long_answer_loss": 0.4862, "loss": 0.5092, "short_answer_loss": NaN, "step": 5, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.501, "grad_norm": 8.25, "learning_rate": 3.9473684210526315e-06, "long_answer_loss": 0.501, "loss": 0.489, "short_answer_loss": NaN, "step": 6, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.4114, "grad_norm": 7.0, "learning_rate": 4.605263157894737e-06, "long_answer_loss": 0.4114, "loss": 0.4271, "short_answer_loss": NaN, "step": 7, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.357, "grad_norm": 5.3125, "learning_rate": 5.263157894736842e-06, "long_answer_loss": 0.357, "loss": 0.3714, "short_answer_loss": NaN, "step": 8, "template_loss": 0.0 }, { "epoch": 0.01, "full_loss": 0.3179, "grad_norm": 6.6875, "learning_rate": 5.921052631578948e-06, "long_answer_loss": 0.3179, "loss": 0.3373, "short_answer_loss": NaN, "step": 9, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2982, "grad_norm": 6.3125, "learning_rate": 6.578947368421053e-06, "long_answer_loss": 0.2982, "loss": 0.3161, "short_answer_loss": NaN, "step": 10, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2744, "grad_norm": 4.96875, "learning_rate": 7.236842105263158e-06, "long_answer_loss": 0.2744, "loss": 0.2775, "short_answer_loss": NaN, "step": 11, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2795, "grad_norm": 4.21875, "learning_rate": 7.894736842105263e-06, "long_answer_loss": 0.2795, "loss": 0.2757, "short_answer_loss": NaN, "step": 12, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2391, "grad_norm": 3.515625, "learning_rate": 8.552631578947368e-06, "long_answer_loss": 0.2391, "loss": 0.241, "short_answer_loss": NaN, "step": 13, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2621, "grad_norm": 3.0625, "learning_rate": 9.210526315789474e-06, "long_answer_loss": 0.2621, "loss": 0.2527, "short_answer_loss": NaN, "step": 14, "template_loss": 0.0 }, { "epoch": 0.02, "full_loss": 0.2162, "grad_norm": 2.875, "learning_rate": 9.868421052631579e-06, "long_answer_loss": 0.2162, "loss": 0.2324, "short_answer_loss": NaN, "step": 15, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.2265, "grad_norm": 3.078125, "learning_rate": 1.0526315789473684e-05, "long_answer_loss": 0.2265, "loss": 0.238, "short_answer_loss": NaN, "step": 16, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.2399, "grad_norm": 2.96875, "learning_rate": 1.118421052631579e-05, "long_answer_loss": 0.2399, "loss": 0.2288, "short_answer_loss": NaN, "step": 17, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.1865, "grad_norm": 2.65625, "learning_rate": 1.1842105263157895e-05, "long_answer_loss": 0.1865, "loss": 0.209, "short_answer_loss": NaN, "step": 18, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.1784, "grad_norm": 2.703125, "learning_rate": 1.25e-05, "long_answer_loss": 0.1784, "loss": 0.2255, "short_answer_loss": NaN, "step": 19, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.2396, "grad_norm": 2.859375, "learning_rate": 1.3157894736842106e-05, "long_answer_loss": 0.2396, "loss": 0.2175, "short_answer_loss": NaN, "step": 20, "template_loss": 0.0 }, { "epoch": 0.03, "full_loss": 0.1953, "grad_norm": 2.796875, "learning_rate": 1.3815789473684213e-05, "long_answer_loss": 0.1953, "loss": 0.2107, "short_answer_loss": NaN, "step": 21, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.1817, "grad_norm": 2.625, "learning_rate": 1.4473684210526317e-05, "long_answer_loss": 0.1817, "loss": 0.2208, "short_answer_loss": NaN, "step": 22, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.1913, "grad_norm": 2.703125, "learning_rate": 1.5131578947368422e-05, "long_answer_loss": 0.1913, "loss": 0.1938, "short_answer_loss": NaN, "step": 23, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.1947, "grad_norm": 2.59375, "learning_rate": 1.5789473684210526e-05, "long_answer_loss": 0.1947, "loss": 0.2017, "short_answer_loss": NaN, "step": 24, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.2353, "grad_norm": 2.59375, "learning_rate": 1.6447368421052635e-05, "long_answer_loss": 0.2353, "loss": 0.2086, "short_answer_loss": NaN, "step": 25, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.2258, "grad_norm": 2.578125, "learning_rate": 1.7105263157894737e-05, "long_answer_loss": 0.2258, "loss": 0.1929, "short_answer_loss": NaN, "step": 26, "template_loss": 0.0 }, { "epoch": 0.04, "full_loss": 0.1916, "grad_norm": 2.8125, "learning_rate": 1.7763157894736842e-05, "long_answer_loss": 0.1916, "loss": 0.1996, "short_answer_loss": NaN, "step": 27, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.185, "grad_norm": 2.78125, "learning_rate": 1.8421052631578947e-05, "long_answer_loss": 0.185, "loss": 0.193, "short_answer_loss": NaN, "step": 28, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.1788, "grad_norm": 2.65625, "learning_rate": 1.9078947368421056e-05, "long_answer_loss": 0.1788, "loss": 0.1975, "short_answer_loss": NaN, "step": 29, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.1976, "grad_norm": 2.609375, "learning_rate": 1.9736842105263158e-05, "long_answer_loss": 0.1976, "loss": 0.1802, "short_answer_loss": NaN, "step": 30, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.204, "grad_norm": 2.40625, "learning_rate": 2.0394736842105264e-05, "long_answer_loss": 0.204, "loss": 0.1897, "short_answer_loss": NaN, "step": 31, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.191, "grad_norm": 2.46875, "learning_rate": 2.105263157894737e-05, "long_answer_loss": 0.191, "loss": 0.2029, "short_answer_loss": NaN, "step": 32, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.1831, "grad_norm": 2.375, "learning_rate": 2.1710526315789474e-05, "long_answer_loss": 0.1831, "loss": 0.1809, "short_answer_loss": NaN, "step": 33, "template_loss": 0.0 }, { "epoch": 0.05, "full_loss": 0.2093, "grad_norm": 2.40625, "learning_rate": 2.236842105263158e-05, "long_answer_loss": 0.2093, "loss": 0.193, "short_answer_loss": NaN, "step": 34, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.2136, "grad_norm": 2.640625, "learning_rate": 2.3026315789473685e-05, "long_answer_loss": 0.2136, "loss": 0.1973, "short_answer_loss": NaN, "step": 35, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.2073, "grad_norm": 2.234375, "learning_rate": 2.368421052631579e-05, "long_answer_loss": 0.2073, "loss": 0.1839, "short_answer_loss": NaN, "step": 36, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.2317, "grad_norm": 2.625, "learning_rate": 2.4342105263157896e-05, "long_answer_loss": 0.2317, "loss": 0.1985, "short_answer_loss": NaN, "step": 37, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.1949, "grad_norm": 2.578125, "learning_rate": 2.5e-05, "long_answer_loss": 0.1949, "loss": 0.1837, "short_answer_loss": NaN, "step": 38, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.1792, "grad_norm": 2.484375, "learning_rate": 2.499995702005279e-05, "long_answer_loss": 0.1792, "loss": 0.1878, "short_answer_loss": NaN, "step": 39, "template_loss": 0.0 }, { "epoch": 0.06, "full_loss": 0.1855, "grad_norm": 2.25, "learning_rate": 2.499982808050672e-05, "long_answer_loss": 0.1855, "loss": 0.1966, "short_answer_loss": NaN, "step": 40, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.1857, "grad_norm": 2.546875, "learning_rate": 2.4999613182248482e-05, "long_answer_loss": 0.1857, "loss": 0.1934, "short_answer_loss": NaN, "step": 41, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.1906, "grad_norm": 2.4375, "learning_rate": 2.499931232675589e-05, "long_answer_loss": 0.1906, "loss": 0.1937, "short_answer_loss": NaN, "step": 42, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.1886, "grad_norm": 2.515625, "learning_rate": 2.499892551609786e-05, "long_answer_loss": 0.1886, "loss": 0.1995, "short_answer_loss": NaN, "step": 43, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.1893, "grad_norm": 2.734375, "learning_rate": 2.499845275293441e-05, "long_answer_loss": 0.1893, "loss": 0.2026, "short_answer_loss": NaN, "step": 44, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.2057, "grad_norm": 2.53125, "learning_rate": 2.499789404051663e-05, "long_answer_loss": 0.2057, "loss": 0.2021, "short_answer_loss": NaN, "step": 45, "template_loss": 0.0 }, { "epoch": 0.07, "full_loss": 0.1896, "grad_norm": 2.5, "learning_rate": 2.4997249382686673e-05, "long_answer_loss": 0.1896, "loss": 0.1881, "short_answer_loss": NaN, "step": 46, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.2066, "grad_norm": 2.46875, "learning_rate": 2.4996518783877716e-05, "long_answer_loss": 0.2066, "loss": 0.1984, "short_answer_loss": NaN, "step": 47, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.1739, "grad_norm": 2.515625, "learning_rate": 2.4995702249113935e-05, "long_answer_loss": 0.1739, "loss": 0.1963, "short_answer_loss": NaN, "step": 48, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.1877, "grad_norm": 2.375, "learning_rate": 2.499479978401047e-05, "long_answer_loss": 0.1877, "loss": 0.1921, "short_answer_loss": NaN, "step": 49, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.1709, "grad_norm": 2.375, "learning_rate": 2.499381139477338e-05, "long_answer_loss": 0.1709, "loss": 0.1892, "short_answer_loss": NaN, "step": 50, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.1591, "grad_norm": 2.53125, "learning_rate": 2.4992737088199623e-05, "long_answer_loss": 0.1591, "loss": 0.1913, "short_answer_loss": NaN, "step": 51, "template_loss": 0.0 }, { "epoch": 0.08, "full_loss": 0.1878, "grad_norm": 2.53125, "learning_rate": 2.499157687167697e-05, "long_answer_loss": 0.1878, "loss": 0.1977, "short_answer_loss": NaN, "step": 52, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.1668, "grad_norm": 2.421875, "learning_rate": 2.499033075318399e-05, "long_answer_loss": 0.1668, "loss": 0.1937, "short_answer_loss": NaN, "step": 53, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.1727, "grad_norm": 2.453125, "learning_rate": 2.4988998741289986e-05, "long_answer_loss": 0.1727, "loss": 0.1807, "short_answer_loss": NaN, "step": 54, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.1721, "grad_norm": 2.9375, "learning_rate": 2.4987580845154922e-05, "long_answer_loss": 0.1721, "loss": 0.1911, "short_answer_loss": NaN, "step": 55, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.2108, "grad_norm": 2.40625, "learning_rate": 2.4986077074529374e-05, "long_answer_loss": 0.2108, "loss": 0.1986, "short_answer_loss": NaN, "step": 56, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.1706, "grad_norm": 2.453125, "learning_rate": 2.498448743975446e-05, "long_answer_loss": 0.1706, "loss": 0.1781, "short_answer_loss": NaN, "step": 57, "template_loss": 0.0 }, { "epoch": 0.09, "full_loss": 0.1635, "grad_norm": 2.09375, "learning_rate": 2.498281195176177e-05, "long_answer_loss": 0.1635, "loss": 0.1883, "short_answer_loss": NaN, "step": 58, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.1799, "grad_norm": 2.546875, "learning_rate": 2.498105062207328e-05, "long_answer_loss": 0.1799, "loss": 0.1854, "short_answer_loss": NaN, "step": 59, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.2089, "grad_norm": 2.515625, "learning_rate": 2.4979203462801287e-05, "long_answer_loss": 0.2089, "loss": 0.1871, "short_answer_loss": NaN, "step": 60, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.2018, "grad_norm": 2.34375, "learning_rate": 2.497727048664833e-05, "long_answer_loss": 0.2018, "loss": 0.1861, "short_answer_loss": NaN, "step": 61, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.203, "grad_norm": 2.40625, "learning_rate": 2.497525170690707e-05, "long_answer_loss": 0.203, "loss": 0.1979, "short_answer_loss": NaN, "step": 62, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.1858, "grad_norm": 2.078125, "learning_rate": 2.4973147137460246e-05, "long_answer_loss": 0.1858, "loss": 0.1876, "short_answer_loss": NaN, "step": 63, "template_loss": 0.0 }, { "epoch": 0.1, "full_loss": 0.1554, "grad_norm": 2.265625, "learning_rate": 2.4970956792780533e-05, "long_answer_loss": 0.1554, "loss": 0.1781, "short_answer_loss": NaN, "step": 64, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.2165, "grad_norm": 2.5, "learning_rate": 2.4968680687930482e-05, "long_answer_loss": 0.2165, "loss": 0.1937, "short_answer_loss": NaN, "step": 65, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.186, "grad_norm": 2.046875, "learning_rate": 2.4966318838562392e-05, "long_answer_loss": 0.186, "loss": 0.1856, "short_answer_loss": NaN, "step": 66, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.1919, "grad_norm": 2.34375, "learning_rate": 2.49638712609182e-05, "long_answer_loss": 0.1919, "loss": 0.1886, "short_answer_loss": NaN, "step": 67, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.1953, "grad_norm": 2.125, "learning_rate": 2.4961337971829397e-05, "long_answer_loss": 0.1953, "loss": 0.1902, "short_answer_loss": NaN, "step": 68, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.1847, "grad_norm": 2.109375, "learning_rate": 2.4958718988716885e-05, "long_answer_loss": 0.1847, "loss": 0.1858, "short_answer_loss": NaN, "step": 69, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.2127, "grad_norm": 2.203125, "learning_rate": 2.4956014329590855e-05, "long_answer_loss": 0.2127, "loss": 0.1936, "short_answer_loss": NaN, "step": 70, "template_loss": 0.0 }, { "epoch": 0.11, "full_loss": 0.1856, "grad_norm": 2.421875, "learning_rate": 2.495322401305069e-05, "long_answer_loss": 0.1856, "loss": 0.1982, "short_answer_loss": NaN, "step": 71, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.1997, "grad_norm": 2.390625, "learning_rate": 2.4950348058284813e-05, "long_answer_loss": 0.1997, "loss": 0.1893, "short_answer_loss": NaN, "step": 72, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.2013, "grad_norm": 2.125, "learning_rate": 2.494738648507057e-05, "long_answer_loss": 0.2013, "loss": 0.1877, "short_answer_loss": NaN, "step": 73, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.1742, "grad_norm": 2.1875, "learning_rate": 2.494433931377408e-05, "long_answer_loss": 0.1742, "loss": 0.1841, "short_answer_loss": NaN, "step": 74, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.2026, "grad_norm": 2.21875, "learning_rate": 2.4941206565350102e-05, "long_answer_loss": 0.2026, "loss": 0.1785, "short_answer_loss": NaN, "step": 75, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.2138, "grad_norm": 2.234375, "learning_rate": 2.49379882613419e-05, "long_answer_loss": 0.2138, "loss": 0.1952, "short_answer_loss": NaN, "step": 76, "template_loss": 0.0 }, { "epoch": 0.12, "full_loss": 0.183, "grad_norm": 2.125, "learning_rate": 2.4934684423881074e-05, "long_answer_loss": 0.183, "loss": 0.1743, "short_answer_loss": NaN, "step": 77, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.1767, "grad_norm": 2.390625, "learning_rate": 2.4931295075687428e-05, "long_answer_loss": 0.1767, "loss": 0.1919, "short_answer_loss": NaN, "step": 78, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.1558, "grad_norm": 2.09375, "learning_rate": 2.4927820240068805e-05, "long_answer_loss": 0.1558, "loss": 0.1802, "short_answer_loss": NaN, "step": 79, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.1912, "grad_norm": 2.234375, "learning_rate": 2.492425994092092e-05, "long_answer_loss": 0.1912, "loss": 0.1875, "short_answer_loss": NaN, "step": 80, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.2034, "grad_norm": 2.125, "learning_rate": 2.4920614202727217e-05, "long_answer_loss": 0.2034, "loss": 0.1925, "short_answer_loss": NaN, "step": 81, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.1561, "grad_norm": 2.09375, "learning_rate": 2.4916883050558664e-05, "long_answer_loss": 0.1561, "loss": 0.1878, "short_answer_loss": NaN, "step": 82, "template_loss": 0.0 }, { "epoch": 0.13, "full_loss": 0.2187, "grad_norm": 2.34375, "learning_rate": 2.491306651007363e-05, "long_answer_loss": 0.2187, "loss": 0.1993, "short_answer_loss": NaN, "step": 83, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.2146, "grad_norm": 2.375, "learning_rate": 2.490916460751766e-05, "long_answer_loss": 0.2146, "loss": 0.1946, "short_answer_loss": NaN, "step": 84, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.1711, "grad_norm": 2.171875, "learning_rate": 2.4905177369723333e-05, "long_answer_loss": 0.1711, "loss": 0.1791, "short_answer_loss": NaN, "step": 85, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.1823, "grad_norm": 2.15625, "learning_rate": 2.4901104824110042e-05, "long_answer_loss": 0.1823, "loss": 0.1865, "short_answer_loss": NaN, "step": 86, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.1902, "grad_norm": 2.171875, "learning_rate": 2.489694699868384e-05, "long_answer_loss": 0.1902, "loss": 0.1932, "short_answer_loss": NaN, "step": 87, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.1712, "grad_norm": 2.09375, "learning_rate": 2.4892703922037225e-05, "long_answer_loss": 0.1712, "loss": 0.185, "short_answer_loss": NaN, "step": 88, "template_loss": 0.0 }, { "epoch": 0.14, "full_loss": 0.1422, "grad_norm": 2.109375, "learning_rate": 2.488837562334895e-05, "long_answer_loss": 0.1422, "loss": 0.1847, "short_answer_loss": NaN, "step": 89, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.1879, "grad_norm": 2.25, "learning_rate": 2.4883962132383823e-05, "long_answer_loss": 0.1879, "loss": 0.19, "short_answer_loss": NaN, "step": 90, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.2042, "grad_norm": 2.09375, "learning_rate": 2.4879463479492504e-05, "long_answer_loss": 0.2042, "loss": 0.1942, "short_answer_loss": NaN, "step": 91, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.2047, "grad_norm": 2.203125, "learning_rate": 2.4874879695611287e-05, "long_answer_loss": 0.2047, "loss": 0.193, "short_answer_loss": NaN, "step": 92, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.1753, "grad_norm": 1.9765625, "learning_rate": 2.4870210812261898e-05, "long_answer_loss": 0.1753, "loss": 0.1829, "short_answer_loss": NaN, "step": 93, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.1802, "grad_norm": 2.015625, "learning_rate": 2.486545686155128e-05, "long_answer_loss": 0.1802, "loss": 0.191, "short_answer_loss": NaN, "step": 94, "template_loss": 0.0 }, { "epoch": 0.15, "full_loss": 0.1443, "grad_norm": 2.171875, "learning_rate": 2.4860617876171355e-05, "long_answer_loss": 0.1443, "loss": 0.18, "short_answer_loss": NaN, "step": 95, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.1755, "grad_norm": 2.109375, "learning_rate": 2.4855693889398822e-05, "long_answer_loss": 0.1755, "loss": 0.1902, "short_answer_loss": NaN, "step": 96, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.1992, "grad_norm": 1.9921875, "learning_rate": 2.485068493509491e-05, "long_answer_loss": 0.1992, "loss": 0.1887, "short_answer_loss": NaN, "step": 97, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.2405, "grad_norm": 2.203125, "learning_rate": 2.4845591047705153e-05, "long_answer_loss": 0.2405, "loss": 0.1958, "short_answer_loss": NaN, "step": 98, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.2091, "grad_norm": 2.203125, "learning_rate": 2.484041226225915e-05, "long_answer_loss": 0.2091, "loss": 0.1872, "short_answer_loss": NaN, "step": 99, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.1757, "grad_norm": 1.9765625, "learning_rate": 2.4835148614370334e-05, "long_answer_loss": 0.1757, "loss": 0.1841, "short_answer_loss": NaN, "step": 100, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.1818, "grad_norm": 2.125, "learning_rate": 2.482980014023571e-05, "long_answer_loss": 0.1818, "loss": 0.1852, "short_answer_loss": NaN, "step": 101, "template_loss": 0.0 }, { "epoch": 0.16, "full_loss": 0.1899, "grad_norm": 2.453125, "learning_rate": 2.4824366876635623e-05, "long_answer_loss": 0.1899, "loss": 0.195, "short_answer_loss": NaN, "step": 102, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.1858, "grad_norm": 2.28125, "learning_rate": 2.481884886093349e-05, "long_answer_loss": 0.1858, "loss": 0.1946, "short_answer_loss": NaN, "step": 103, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.1847, "grad_norm": 2.109375, "learning_rate": 2.4813246131075564e-05, "long_answer_loss": 0.1847, "loss": 0.1895, "short_answer_loss": NaN, "step": 104, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.1691, "grad_norm": 2.1875, "learning_rate": 2.480755872559064e-05, "long_answer_loss": 0.1691, "loss": 0.191, "short_answer_loss": NaN, "step": 105, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.2181, "grad_norm": 2.265625, "learning_rate": 2.4801786683589824e-05, "long_answer_loss": 0.2181, "loss": 0.1879, "short_answer_loss": NaN, "step": 106, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.1856, "grad_norm": 2.25, "learning_rate": 2.4795930044766247e-05, "long_answer_loss": 0.1856, "loss": 0.1899, "short_answer_loss": NaN, "step": 107, "template_loss": 0.0 }, { "epoch": 0.17, "full_loss": 0.1754, "grad_norm": 2.0625, "learning_rate": 2.4789988849394792e-05, "long_answer_loss": 0.1754, "loss": 0.1905, "short_answer_loss": NaN, "step": 108, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.1835, "grad_norm": 2.28125, "learning_rate": 2.478396313833182e-05, "long_answer_loss": 0.1835, "loss": 0.1983, "short_answer_loss": NaN, "step": 109, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.2057, "grad_norm": 2.1875, "learning_rate": 2.4777852953014896e-05, "long_answer_loss": 0.2057, "loss": 0.1931, "short_answer_loss": NaN, "step": 110, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.1497, "grad_norm": 1.9921875, "learning_rate": 2.4771658335462483e-05, "long_answer_loss": 0.1497, "loss": 0.1912, "short_answer_loss": NaN, "step": 111, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.1689, "grad_norm": 2.21875, "learning_rate": 2.476537932827368e-05, "long_answer_loss": 0.1689, "loss": 0.1817, "short_answer_loss": NaN, "step": 112, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.2024, "grad_norm": 1.8515625, "learning_rate": 2.4759015974627906e-05, "long_answer_loss": 0.2024, "loss": 0.1819, "short_answer_loss": NaN, "step": 113, "template_loss": 0.0 }, { "epoch": 0.18, "full_loss": 0.1597, "grad_norm": 2.109375, "learning_rate": 2.475256831828462e-05, "long_answer_loss": 0.1597, "loss": 0.1916, "short_answer_loss": NaN, "step": 114, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1845, "grad_norm": 2.0, "learning_rate": 2.4746036403583012e-05, "long_answer_loss": 0.1845, "loss": 0.1897, "short_answer_loss": NaN, "step": 115, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1652, "grad_norm": 2.0625, "learning_rate": 2.4739420275441694e-05, "long_answer_loss": 0.1652, "loss": 0.1878, "short_answer_loss": NaN, "step": 116, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1572, "grad_norm": 1.9375, "learning_rate": 2.4732719979358403e-05, "long_answer_loss": 0.1572, "loss": 0.1766, "short_answer_loss": NaN, "step": 117, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1945, "grad_norm": 1.984375, "learning_rate": 2.472593556140968e-05, "long_answer_loss": 0.1945, "loss": 0.1812, "short_answer_loss": NaN, "step": 118, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1754, "grad_norm": 1.890625, "learning_rate": 2.4719067068250552e-05, "long_answer_loss": 0.1754, "loss": 0.1834, "short_answer_loss": NaN, "step": 119, "template_loss": 0.0 }, { "epoch": 0.19, "full_loss": 0.1827, "grad_norm": 2.015625, "learning_rate": 2.4712114547114212e-05, "long_answer_loss": 0.1827, "loss": 0.1866, "short_answer_loss": NaN, "step": 120, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.1763, "grad_norm": 1.921875, "learning_rate": 2.4705078045811704e-05, "long_answer_loss": 0.1763, "loss": 0.1771, "short_answer_loss": NaN, "step": 121, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.1739, "grad_norm": 1.9921875, "learning_rate": 2.469795761273157e-05, "long_answer_loss": 0.1739, "loss": 0.1826, "short_answer_loss": NaN, "step": 122, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.1831, "grad_norm": 2.09375, "learning_rate": 2.4690753296839558e-05, "long_answer_loss": 0.1831, "loss": 0.192, "short_answer_loss": NaN, "step": 123, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.1404, "grad_norm": 2.0, "learning_rate": 2.4683465147678235e-05, "long_answer_loss": 0.1404, "loss": 0.1811, "short_answer_loss": NaN, "step": 124, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.1653, "grad_norm": 2.140625, "learning_rate": 2.4676093215366695e-05, "long_answer_loss": 0.1653, "loss": 0.185, "short_answer_loss": NaN, "step": 125, "template_loss": 0.0 }, { "epoch": 0.2, "full_loss": 0.2176, "grad_norm": 2.203125, "learning_rate": 2.466863755060017e-05, "long_answer_loss": 0.2176, "loss": 0.1936, "short_answer_loss": NaN, "step": 126, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.2059, "grad_norm": 1.9609375, "learning_rate": 2.4661098204649717e-05, "long_answer_loss": 0.2059, "loss": 0.1855, "short_answer_loss": NaN, "step": 127, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.1801, "grad_norm": 2.40625, "learning_rate": 2.4653475229361843e-05, "long_answer_loss": 0.1801, "loss": 0.193, "short_answer_loss": NaN, "step": 128, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.1823, "grad_norm": 1.9140625, "learning_rate": 2.4645768677158165e-05, "long_answer_loss": 0.1823, "loss": 0.1789, "short_answer_loss": NaN, "step": 129, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.1603, "grad_norm": 2.046875, "learning_rate": 2.4637978601035033e-05, "long_answer_loss": 0.1603, "loss": 0.1858, "short_answer_loss": NaN, "step": 130, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.1987, "grad_norm": 2.046875, "learning_rate": 2.463010505456318e-05, "long_answer_loss": 0.1987, "loss": 0.1986, "short_answer_loss": NaN, "step": 131, "template_loss": 0.0 }, { "epoch": 0.21, "full_loss": 0.1814, "grad_norm": 2.171875, "learning_rate": 2.4622148091887338e-05, "long_answer_loss": 0.1814, "loss": 0.1878, "short_answer_loss": NaN, "step": 132, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.1871, "grad_norm": 1.8828125, "learning_rate": 2.4614107767725887e-05, "long_answer_loss": 0.1871, "loss": 0.1781, "short_answer_loss": NaN, "step": 133, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.1763, "grad_norm": 2.015625, "learning_rate": 2.4605984137370452e-05, "long_answer_loss": 0.1763, "loss": 0.1949, "short_answer_loss": NaN, "step": 134, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.2048, "grad_norm": 2.234375, "learning_rate": 2.4597777256685556e-05, "long_answer_loss": 0.2048, "loss": 0.1948, "short_answer_loss": NaN, "step": 135, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.1825, "grad_norm": 1.9375, "learning_rate": 2.45894871821082e-05, "long_answer_loss": 0.1825, "loss": 0.1836, "short_answer_loss": NaN, "step": 136, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.1889, "grad_norm": 1.9140625, "learning_rate": 2.45811139706475e-05, "long_answer_loss": 0.1889, "loss": 0.1848, "short_answer_loss": NaN, "step": 137, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.1961, "grad_norm": 1.859375, "learning_rate": 2.4572657679884285e-05, "long_answer_loss": 0.1961, "loss": 0.179, "short_answer_loss": NaN, "step": 138, "template_loss": 0.0 }, { "epoch": 0.22, "full_loss": 0.197, "grad_norm": 1.8359375, "learning_rate": 2.4564118367970706e-05, "long_answer_loss": 0.197, "loss": 0.1927, "short_answer_loss": NaN, "step": 139, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.2099, "grad_norm": 1.9453125, "learning_rate": 2.455549609362983e-05, "long_answer_loss": 0.2099, "loss": 0.1934, "short_answer_loss": NaN, "step": 140, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.1903, "grad_norm": 2.28125, "learning_rate": 2.4546790916155243e-05, "long_answer_loss": 0.1903, "loss": 0.1921, "short_answer_loss": NaN, "step": 141, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.1882, "grad_norm": 1.8984375, "learning_rate": 2.4538002895410634e-05, "long_answer_loss": 0.1882, "loss": 0.1834, "short_answer_loss": NaN, "step": 142, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.2116, "grad_norm": 1.921875, "learning_rate": 2.452913209182939e-05, "long_answer_loss": 0.2116, "loss": 0.1767, "short_answer_loss": NaN, "step": 143, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.1931, "grad_norm": 1.9921875, "learning_rate": 2.4520178566414177e-05, "long_answer_loss": 0.1931, "loss": 0.1955, "short_answer_loss": NaN, "step": 144, "template_loss": 0.0 }, { "epoch": 0.23, "full_loss": 0.1908, "grad_norm": 2.09375, "learning_rate": 2.4511142380736517e-05, "long_answer_loss": 0.1908, "loss": 0.1934, "short_answer_loss": NaN, "step": 145, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.2259, "grad_norm": 1.9140625, "learning_rate": 2.450202359693639e-05, "long_answer_loss": 0.2259, "loss": 0.1822, "short_answer_loss": NaN, "step": 146, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.2343, "grad_norm": 1.953125, "learning_rate": 2.449282227772176e-05, "long_answer_loss": 0.2343, "loss": 0.1911, "short_answer_loss": NaN, "step": 147, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.1845, "grad_norm": 2.015625, "learning_rate": 2.4483538486368186e-05, "long_answer_loss": 0.1845, "loss": 0.1914, "short_answer_loss": NaN, "step": 148, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.1773, "grad_norm": 1.78125, "learning_rate": 2.4474172286718363e-05, "long_answer_loss": 0.1773, "loss": 0.179, "short_answer_loss": NaN, "step": 149, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.187, "grad_norm": 1.8828125, "learning_rate": 2.4464723743181693e-05, "long_answer_loss": 0.187, "loss": 0.1869, "short_answer_loss": NaN, "step": 150, "template_loss": 0.0 }, { "epoch": 0.24, "full_loss": 0.1795, "grad_norm": 2.03125, "learning_rate": 2.445519292073385e-05, "long_answer_loss": 0.1795, "loss": 0.1928, "short_answer_loss": NaN, "step": 151, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.1942, "grad_norm": 1.96875, "learning_rate": 2.4445579884916297e-05, "long_answer_loss": 0.1942, "loss": 0.1897, "short_answer_loss": NaN, "step": 152, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.183, "grad_norm": 1.9140625, "learning_rate": 2.443588470183589e-05, "long_answer_loss": 0.183, "loss": 0.1922, "short_answer_loss": NaN, "step": 153, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.1974, "grad_norm": 1.953125, "learning_rate": 2.442610743816438e-05, "long_answer_loss": 0.1974, "loss": 0.1998, "short_answer_loss": NaN, "step": 154, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.169, "grad_norm": 1.9140625, "learning_rate": 2.4416248161137972e-05, "long_answer_loss": 0.169, "loss": 0.1812, "short_answer_loss": NaN, "step": 155, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.1794, "grad_norm": 1.890625, "learning_rate": 2.4406306938556853e-05, "long_answer_loss": 0.1794, "loss": 0.1882, "short_answer_loss": NaN, "step": 156, "template_loss": 0.0 }, { "epoch": 0.25, "full_loss": 0.1685, "grad_norm": 1.859375, "learning_rate": 2.4396283838784743e-05, "long_answer_loss": 0.1685, "loss": 0.1772, "short_answer_loss": NaN, "step": 157, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1855, "grad_norm": 2.015625, "learning_rate": 2.438617893074841e-05, "long_answer_loss": 0.1855, "loss": 0.1951, "short_answer_loss": NaN, "step": 158, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1648, "grad_norm": 1.8046875, "learning_rate": 2.4375992283937194e-05, "long_answer_loss": 0.1648, "loss": 0.1752, "short_answer_loss": NaN, "step": 159, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1721, "grad_norm": 1.8359375, "learning_rate": 2.4365723968402552e-05, "long_answer_loss": 0.1721, "loss": 0.1834, "short_answer_loss": NaN, "step": 160, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1973, "grad_norm": 1.9375, "learning_rate": 2.4355374054757546e-05, "long_answer_loss": 0.1973, "loss": 0.1833, "short_answer_loss": NaN, "step": 161, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1963, "grad_norm": 1.84375, "learning_rate": 2.434494261417637e-05, "long_answer_loss": 0.1963, "loss": 0.1867, "short_answer_loss": NaN, "step": 162, "template_loss": 0.0 }, { "epoch": 0.26, "full_loss": 0.1969, "grad_norm": 1.78125, "learning_rate": 2.433442971839387e-05, "long_answer_loss": 0.1969, "loss": 0.1884, "short_answer_loss": NaN, "step": 163, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.2025, "grad_norm": 2.109375, "learning_rate": 2.432383543970504e-05, "long_answer_loss": 0.2025, "loss": 0.1926, "short_answer_loss": NaN, "step": 164, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.2021, "grad_norm": 1.859375, "learning_rate": 2.4313159850964523e-05, "long_answer_loss": 0.2021, "loss": 0.175, "short_answer_loss": NaN, "step": 165, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.159, "grad_norm": 1.90625, "learning_rate": 2.4302403025586122e-05, "long_answer_loss": 0.159, "loss": 0.1889, "short_answer_loss": NaN, "step": 166, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.1843, "grad_norm": 2.046875, "learning_rate": 2.429156503754228e-05, "long_answer_loss": 0.1843, "loss": 0.1956, "short_answer_loss": NaN, "step": 167, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.1944, "grad_norm": 1.8203125, "learning_rate": 2.428064596136358e-05, "long_answer_loss": 0.1944, "loss": 0.1856, "short_answer_loss": NaN, "step": 168, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.1753, "grad_norm": 1.78125, "learning_rate": 2.4269645872138237e-05, "long_answer_loss": 0.1753, "loss": 0.1751, "short_answer_loss": NaN, "step": 169, "template_loss": 0.0 }, { "epoch": 0.27, "full_loss": 0.1986, "grad_norm": 1.7890625, "learning_rate": 2.4258564845511568e-05, "long_answer_loss": 0.1986, "loss": 0.1929, "short_answer_loss": NaN, "step": 170, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.1973, "grad_norm": 1.9375, "learning_rate": 2.4247402957685482e-05, "long_answer_loss": 0.1973, "loss": 0.1852, "short_answer_loss": NaN, "step": 171, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.2181, "grad_norm": 2.0, "learning_rate": 2.4236160285417964e-05, "long_answer_loss": 0.2181, "loss": 0.2004, "short_answer_loss": NaN, "step": 172, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.1811, "grad_norm": 1.890625, "learning_rate": 2.4224836906022518e-05, "long_answer_loss": 0.1811, "loss": 0.1879, "short_answer_loss": NaN, "step": 173, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.1941, "grad_norm": 1.8828125, "learning_rate": 2.421343289736767e-05, "long_answer_loss": 0.1941, "loss": 0.1896, "short_answer_loss": NaN, "step": 174, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.1872, "grad_norm": 1.9140625, "learning_rate": 2.4201948337876405e-05, "long_answer_loss": 0.1872, "loss": 0.1869, "short_answer_loss": NaN, "step": 175, "template_loss": 0.0 }, { "epoch": 0.28, "full_loss": 0.2018, "grad_norm": 2.0, "learning_rate": 2.4190383306525647e-05, "long_answer_loss": 0.2018, "loss": 0.188, "short_answer_loss": NaN, "step": 176, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.1861, "grad_norm": 1.921875, "learning_rate": 2.4178737882845708e-05, "long_answer_loss": 0.1861, "loss": 0.1856, "short_answer_loss": NaN, "step": 177, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.2307, "grad_norm": 2.0, "learning_rate": 2.4167012146919735e-05, "long_answer_loss": 0.2307, "loss": 0.1893, "short_answer_loss": NaN, "step": 178, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.1916, "grad_norm": 1.96875, "learning_rate": 2.4155206179383172e-05, "long_answer_loss": 0.1916, "loss": 0.1842, "short_answer_loss": NaN, "step": 179, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.2011, "grad_norm": 1.890625, "learning_rate": 2.41433200614232e-05, "long_answer_loss": 0.2011, "loss": 0.192, "short_answer_loss": NaN, "step": 180, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.1894, "grad_norm": 2.03125, "learning_rate": 2.4131353874778168e-05, "long_answer_loss": 0.1894, "loss": 0.1856, "short_answer_loss": NaN, "step": 181, "template_loss": 0.0 }, { "epoch": 0.29, "full_loss": 0.1915, "grad_norm": 2.0625, "learning_rate": 2.4119307701737053e-05, "long_answer_loss": 0.1915, "loss": 0.196, "short_answer_loss": NaN, "step": 182, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.2175, "grad_norm": 1.7421875, "learning_rate": 2.4107181625138874e-05, "long_answer_loss": 0.2175, "loss": 0.1841, "short_answer_loss": NaN, "step": 183, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.1877, "grad_norm": 1.8984375, "learning_rate": 2.4094975728372133e-05, "long_answer_loss": 0.1877, "loss": 0.1905, "short_answer_loss": NaN, "step": 184, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.2038, "grad_norm": 1.859375, "learning_rate": 2.4082690095374234e-05, "long_answer_loss": 0.2038, "loss": 0.1906, "short_answer_loss": NaN, "step": 185, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.2006, "grad_norm": 1.78125, "learning_rate": 2.407032481063092e-05, "long_answer_loss": 0.2006, "loss": 0.1848, "short_answer_loss": NaN, "step": 186, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.2446, "grad_norm": 1.984375, "learning_rate": 2.4057879959175672e-05, "long_answer_loss": 0.2446, "loss": 0.196, "short_answer_loss": NaN, "step": 187, "template_loss": 0.0 }, { "epoch": 0.3, "full_loss": 0.199, "grad_norm": 1.8359375, "learning_rate": 2.4045355626589145e-05, "long_answer_loss": 0.199, "loss": 0.1846, "short_answer_loss": NaN, "step": 188, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.2025, "grad_norm": 2.265625, "learning_rate": 2.4032751898998555e-05, "long_answer_loss": 0.2025, "loss": 0.1986, "short_answer_loss": NaN, "step": 189, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.1821, "grad_norm": 2.03125, "learning_rate": 2.4020068863077116e-05, "long_answer_loss": 0.1821, "loss": 0.1906, "short_answer_loss": NaN, "step": 190, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.1585, "grad_norm": 1.7734375, "learning_rate": 2.4007306606043416e-05, "long_answer_loss": 0.1585, "loss": 0.1821, "short_answer_loss": NaN, "step": 191, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.1971, "grad_norm": 1.8515625, "learning_rate": 2.3994465215660846e-05, "long_answer_loss": 0.1971, "loss": 0.1865, "short_answer_loss": NaN, "step": 192, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.1936, "grad_norm": 1.90625, "learning_rate": 2.3981544780236963e-05, "long_answer_loss": 0.1936, "loss": 0.1786, "short_answer_loss": NaN, "step": 193, "template_loss": 0.0 }, { "epoch": 0.31, "full_loss": 0.1601, "grad_norm": 1.96875, "learning_rate": 2.3968545388622917e-05, "long_answer_loss": 0.1601, "loss": 0.184, "short_answer_loss": NaN, "step": 194, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.1794, "grad_norm": 1.859375, "learning_rate": 2.39554671302128e-05, "long_answer_loss": 0.1794, "loss": 0.1826, "short_answer_loss": NaN, "step": 195, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.2061, "grad_norm": 1.9453125, "learning_rate": 2.3942310094943083e-05, "long_answer_loss": 0.2061, "loss": 0.1856, "short_answer_loss": NaN, "step": 196, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.1974, "grad_norm": 1.8671875, "learning_rate": 2.3929074373291946e-05, "long_answer_loss": 0.1974, "loss": 0.188, "short_answer_loss": NaN, "step": 197, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.1961, "grad_norm": 2.109375, "learning_rate": 2.391576005627869e-05, "long_answer_loss": 0.1961, "loss": 0.1941, "short_answer_loss": NaN, "step": 198, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.1659, "grad_norm": 2.03125, "learning_rate": 2.3902367235463104e-05, "long_answer_loss": 0.1659, "loss": 0.1898, "short_answer_loss": NaN, "step": 199, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.221, "grad_norm": 2.015625, "learning_rate": 2.3888896002944815e-05, "long_answer_loss": 0.221, "loss": 0.195, "short_answer_loss": NaN, "step": 200, "template_loss": 0.0 }, { "epoch": 0.32, "full_loss": 0.2119, "grad_norm": 1.9140625, "learning_rate": 2.387534645136269e-05, "long_answer_loss": 0.2119, "loss": 0.1854, "short_answer_loss": NaN, "step": 201, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.2143, "grad_norm": 1.703125, "learning_rate": 2.3861718673894166e-05, "long_answer_loss": 0.2143, "loss": 0.1867, "short_answer_loss": NaN, "step": 202, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.1887, "grad_norm": 2.125, "learning_rate": 2.384801276425463e-05, "long_answer_loss": 0.1887, "loss": 0.1929, "short_answer_loss": NaN, "step": 203, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.1986, "grad_norm": 2.09375, "learning_rate": 2.3834228816696763e-05, "long_answer_loss": 0.1986, "loss": 0.1885, "short_answer_loss": NaN, "step": 204, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.2079, "grad_norm": 1.8515625, "learning_rate": 2.3820366926009903e-05, "long_answer_loss": 0.2079, "loss": 0.1958, "short_answer_loss": NaN, "step": 205, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.1936, "grad_norm": 2.078125, "learning_rate": 2.3806427187519376e-05, "long_answer_loss": 0.1936, "loss": 0.1938, "short_answer_loss": NaN, "step": 206, "template_loss": 0.0 }, { "epoch": 0.33, "full_loss": 0.1903, "grad_norm": 1.859375, "learning_rate": 2.3792409697085866e-05, "long_answer_loss": 0.1903, "loss": 0.1859, "short_answer_loss": NaN, "step": 207, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1969, "grad_norm": 2.09375, "learning_rate": 2.3778314551104725e-05, "long_answer_loss": 0.1969, "loss": 0.1914, "short_answer_loss": NaN, "step": 208, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1905, "grad_norm": 1.9921875, "learning_rate": 2.376414184650534e-05, "long_answer_loss": 0.1905, "loss": 0.1832, "short_answer_loss": NaN, "step": 209, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1684, "grad_norm": 1.78125, "learning_rate": 2.3749891680750445e-05, "long_answer_loss": 0.1684, "loss": 0.1765, "short_answer_loss": NaN, "step": 210, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1781, "grad_norm": 2.03125, "learning_rate": 2.3735564151835462e-05, "long_answer_loss": 0.1781, "loss": 0.1868, "short_answer_loss": NaN, "step": 211, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1583, "grad_norm": 1.9140625, "learning_rate": 2.3721159358287815e-05, "long_answer_loss": 0.1583, "loss": 0.1739, "short_answer_loss": NaN, "step": 212, "template_loss": 0.0 }, { "epoch": 0.34, "full_loss": 0.1895, "grad_norm": 1.96875, "learning_rate": 2.370667739916627e-05, "long_answer_loss": 0.1895, "loss": 0.1963, "short_answer_loss": NaN, "step": 213, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.1893, "grad_norm": 1.8671875, "learning_rate": 2.369211837406024e-05, "long_answer_loss": 0.1893, "loss": 0.1824, "short_answer_loss": NaN, "step": 214, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.177, "grad_norm": 1.796875, "learning_rate": 2.3677482383089105e-05, "long_answer_loss": 0.177, "loss": 0.1841, "short_answer_loss": NaN, "step": 215, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.1768, "grad_norm": 1.8828125, "learning_rate": 2.3662769526901526e-05, "long_answer_loss": 0.1768, "loss": 0.1736, "short_answer_loss": NaN, "step": 216, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.1741, "grad_norm": 2.015625, "learning_rate": 2.364797990667475e-05, "long_answer_loss": 0.1741, "loss": 0.1913, "short_answer_loss": NaN, "step": 217, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.1897, "grad_norm": 1.9296875, "learning_rate": 2.3633113624113908e-05, "long_answer_loss": 0.1897, "loss": 0.1854, "short_answer_loss": NaN, "step": 218, "template_loss": 0.0 }, { "epoch": 0.35, "full_loss": 0.1796, "grad_norm": 1.9296875, "learning_rate": 2.3618170781451328e-05, "long_answer_loss": 0.1796, "loss": 0.1837, "short_answer_loss": NaN, "step": 219, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.1461, "grad_norm": 1.921875, "learning_rate": 2.3603151481445823e-05, "long_answer_loss": 0.1461, "loss": 0.1872, "short_answer_loss": NaN, "step": 220, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.1872, "grad_norm": 1.8984375, "learning_rate": 2.3588055827381995e-05, "long_answer_loss": 0.1872, "loss": 0.1828, "short_answer_loss": NaN, "step": 221, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.1991, "grad_norm": 1.984375, "learning_rate": 2.35728839230695e-05, "long_answer_loss": 0.1991, "loss": 0.1823, "short_answer_loss": NaN, "step": 222, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.2084, "grad_norm": 1.8203125, "learning_rate": 2.3557635872842372e-05, "long_answer_loss": 0.2084, "loss": 0.1903, "short_answer_loss": NaN, "step": 223, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.1454, "grad_norm": 1.8515625, "learning_rate": 2.3542311781558263e-05, "long_answer_loss": 0.1454, "loss": 0.1738, "short_answer_loss": NaN, "step": 224, "template_loss": 0.0 }, { "epoch": 0.36, "full_loss": 0.2094, "grad_norm": 1.8828125, "learning_rate": 2.3526911754597763e-05, "long_answer_loss": 0.2094, "loss": 0.1725, "short_answer_loss": NaN, "step": 225, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.1653, "grad_norm": 1.7578125, "learning_rate": 2.3511435897863647e-05, "long_answer_loss": 0.1653, "loss": 0.1774, "short_answer_loss": NaN, "step": 226, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.1597, "grad_norm": 1.890625, "learning_rate": 2.3495884317780154e-05, "long_answer_loss": 0.1597, "loss": 0.1845, "short_answer_loss": NaN, "step": 227, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.1791, "grad_norm": 1.6796875, "learning_rate": 2.3480257121292254e-05, "long_answer_loss": 0.1791, "loss": 0.1814, "short_answer_loss": NaN, "step": 228, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.161, "grad_norm": 1.7421875, "learning_rate": 2.3464554415864927e-05, "long_answer_loss": 0.161, "loss": 0.1722, "short_answer_loss": NaN, "step": 229, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.1547, "grad_norm": 1.890625, "learning_rate": 2.3448776309482402e-05, "long_answer_loss": 0.1547, "loss": 0.1762, "short_answer_loss": NaN, "step": 230, "template_loss": 0.0 }, { "epoch": 0.37, "full_loss": 0.163, "grad_norm": 1.984375, "learning_rate": 2.3432922910647426e-05, "long_answer_loss": 0.163, "loss": 0.1701, "short_answer_loss": NaN, "step": 231, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1822, "grad_norm": 1.9140625, "learning_rate": 2.341699432838052e-05, "long_answer_loss": 0.1822, "loss": 0.1915, "short_answer_loss": NaN, "step": 232, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1643, "grad_norm": 1.890625, "learning_rate": 2.3400990672219226e-05, "long_answer_loss": 0.1643, "loss": 0.1743, "short_answer_loss": NaN, "step": 233, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1864, "grad_norm": 1.875, "learning_rate": 2.3384912052217345e-05, "long_answer_loss": 0.1864, "loss": 0.1837, "short_answer_loss": NaN, "step": 234, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.2088, "grad_norm": 1.78125, "learning_rate": 2.3368758578944205e-05, "long_answer_loss": 0.2088, "loss": 0.1754, "short_answer_loss": NaN, "step": 235, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1659, "grad_norm": 1.6953125, "learning_rate": 2.3352530363483866e-05, "long_answer_loss": 0.1659, "loss": 0.1826, "short_answer_loss": NaN, "step": 236, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1785, "grad_norm": 1.8046875, "learning_rate": 2.3336227517434385e-05, "long_answer_loss": 0.1785, "loss": 0.1885, "short_answer_loss": NaN, "step": 237, "template_loss": 0.0 }, { "epoch": 0.38, "full_loss": 0.1631, "grad_norm": 1.765625, "learning_rate": 2.331985015290704e-05, "long_answer_loss": 0.1631, "loss": 0.1812, "short_answer_loss": NaN, "step": 238, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1705, "grad_norm": 1.8359375, "learning_rate": 2.330339838252555e-05, "long_answer_loss": 0.1705, "loss": 0.1753, "short_answer_loss": NaN, "step": 239, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1575, "grad_norm": 1.8984375, "learning_rate": 2.3286872319425312e-05, "long_answer_loss": 0.1575, "loss": 0.1864, "short_answer_loss": NaN, "step": 240, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1795, "grad_norm": 1.90625, "learning_rate": 2.3270272077252613e-05, "long_answer_loss": 0.1795, "loss": 0.18, "short_answer_loss": NaN, "step": 241, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1681, "grad_norm": 1.8671875, "learning_rate": 2.3253597770163866e-05, "long_answer_loss": 0.1681, "loss": 0.1814, "short_answer_loss": NaN, "step": 242, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1861, "grad_norm": 1.8671875, "learning_rate": 2.3236849512824793e-05, "long_answer_loss": 0.1861, "loss": 0.184, "short_answer_loss": NaN, "step": 243, "template_loss": 0.0 }, { "epoch": 0.39, "full_loss": 0.1786, "grad_norm": 2.0, "learning_rate": 2.322002742040968e-05, "long_answer_loss": 0.1786, "loss": 0.1818, "short_answer_loss": NaN, "step": 244, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.1927, "grad_norm": 2.078125, "learning_rate": 2.3203131608600548e-05, "long_answer_loss": 0.1927, "loss": 0.1988, "short_answer_loss": NaN, "step": 245, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.1746, "grad_norm": 1.7265625, "learning_rate": 2.318616219358637e-05, "long_answer_loss": 0.1746, "loss": 0.1711, "short_answer_loss": NaN, "step": 246, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.159, "grad_norm": 1.875, "learning_rate": 2.3169119292062273e-05, "long_answer_loss": 0.159, "loss": 0.181, "short_answer_loss": NaN, "step": 247, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.1809, "grad_norm": 1.9296875, "learning_rate": 2.3152003021228746e-05, "long_answer_loss": 0.1809, "loss": 0.1942, "short_answer_loss": NaN, "step": 248, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.1779, "grad_norm": 1.71875, "learning_rate": 2.3134813498790814e-05, "long_answer_loss": 0.1779, "loss": 0.1757, "short_answer_loss": NaN, "step": 249, "template_loss": 0.0 }, { "epoch": 0.4, "full_loss": 0.1589, "grad_norm": 1.8203125, "learning_rate": 2.311755084295723e-05, "long_answer_loss": 0.1589, "loss": 0.1693, "short_answer_loss": NaN, "step": 250, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.1904, "grad_norm": 1.953125, "learning_rate": 2.3100215172439693e-05, "long_answer_loss": 0.1904, "loss": 0.1824, "short_answer_loss": NaN, "step": 251, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.1714, "grad_norm": 1.90625, "learning_rate": 2.308280660645199e-05, "long_answer_loss": 0.1714, "loss": 0.1849, "short_answer_loss": NaN, "step": 252, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.203, "grad_norm": 1.7890625, "learning_rate": 2.3065325264709196e-05, "long_answer_loss": 0.203, "loss": 0.1783, "short_answer_loss": NaN, "step": 253, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.1694, "grad_norm": 2.015625, "learning_rate": 2.3047771267426866e-05, "long_answer_loss": 0.1694, "loss": 0.1777, "short_answer_loss": NaN, "step": 254, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.2025, "grad_norm": 1.8828125, "learning_rate": 2.303014473532017e-05, "long_answer_loss": 0.2025, "loss": 0.1794, "short_answer_loss": NaN, "step": 255, "template_loss": 0.0 }, { "epoch": 0.41, "full_loss": 0.1735, "grad_norm": 1.71875, "learning_rate": 2.3012445789603093e-05, "long_answer_loss": 0.1735, "loss": 0.1752, "short_answer_loss": NaN, "step": 256, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.1522, "grad_norm": 2.015625, "learning_rate": 2.2994674551987605e-05, "long_answer_loss": 0.1522, "loss": 0.1789, "short_answer_loss": NaN, "step": 257, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.1737, "grad_norm": 1.9765625, "learning_rate": 2.2976831144682797e-05, "long_answer_loss": 0.1737, "loss": 0.18, "short_answer_loss": NaN, "step": 258, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.1774, "grad_norm": 1.8671875, "learning_rate": 2.295891569039406e-05, "long_answer_loss": 0.1774, "loss": 0.1879, "short_answer_loss": NaN, "step": 259, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.1798, "grad_norm": 1.8359375, "learning_rate": 2.2940928312322246e-05, "long_answer_loss": 0.1798, "loss": 0.1803, "short_answer_loss": NaN, "step": 260, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.2178, "grad_norm": 1.921875, "learning_rate": 2.29228691341628e-05, "long_answer_loss": 0.2178, "loss": 0.1801, "short_answer_loss": NaN, "step": 261, "template_loss": 0.0 }, { "epoch": 0.42, "full_loss": 0.1985, "grad_norm": 1.90625, "learning_rate": 2.2904738280104927e-05, "long_answer_loss": 0.1985, "loss": 0.1893, "short_answer_loss": NaN, "step": 262, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.1651, "grad_norm": 1.7578125, "learning_rate": 2.2886535874830726e-05, "long_answer_loss": 0.1651, "loss": 0.1812, "short_answer_loss": NaN, "step": 263, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.2057, "grad_norm": 1.7578125, "learning_rate": 2.286826204351435e-05, "long_answer_loss": 0.2057, "loss": 0.1824, "short_answer_loss": NaN, "step": 264, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.1897, "grad_norm": 1.9453125, "learning_rate": 2.284991691182113e-05, "long_answer_loss": 0.1897, "loss": 0.1896, "short_answer_loss": NaN, "step": 265, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.1933, "grad_norm": 1.890625, "learning_rate": 2.2831500605906702e-05, "long_answer_loss": 0.1933, "loss": 0.1734, "short_answer_loss": NaN, "step": 266, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.1577, "grad_norm": 1.7109375, "learning_rate": 2.281301325241617e-05, "long_answer_loss": 0.1577, "loss": 0.1691, "short_answer_loss": NaN, "step": 267, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.161, "grad_norm": 1.9375, "learning_rate": 2.279445497848321e-05, "long_answer_loss": 0.161, "loss": 0.181, "short_answer_loss": NaN, "step": 268, "template_loss": 0.0 }, { "epoch": 0.43, "full_loss": 0.1905, "grad_norm": 2.046875, "learning_rate": 2.2775825911729207e-05, "long_answer_loss": 0.1905, "loss": 0.1961, "short_answer_loss": NaN, "step": 269, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1823, "grad_norm": 1.75, "learning_rate": 2.275712618026236e-05, "long_answer_loss": 0.1823, "loss": 0.1751, "short_answer_loss": NaN, "step": 270, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1653, "grad_norm": 1.8984375, "learning_rate": 2.2738355912676838e-05, "long_answer_loss": 0.1653, "loss": 0.1805, "short_answer_loss": NaN, "step": 271, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1818, "grad_norm": 1.859375, "learning_rate": 2.2719515238051846e-05, "long_answer_loss": 0.1818, "loss": 0.1761, "short_answer_loss": NaN, "step": 272, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1863, "grad_norm": 1.984375, "learning_rate": 2.2700604285950783e-05, "long_answer_loss": 0.1863, "loss": 0.1864, "short_answer_loss": NaN, "step": 273, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1573, "grad_norm": 2.078125, "learning_rate": 2.2681623186420323e-05, "long_answer_loss": 0.1573, "loss": 0.1811, "short_answer_loss": NaN, "step": 274, "template_loss": 0.0 }, { "epoch": 0.44, "full_loss": 0.1858, "grad_norm": 1.71875, "learning_rate": 2.266257206998953e-05, "long_answer_loss": 0.1858, "loss": 0.1731, "short_answer_loss": NaN, "step": 275, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.185, "grad_norm": 2.125, "learning_rate": 2.264345106766896e-05, "long_answer_loss": 0.185, "loss": 0.1823, "short_answer_loss": NaN, "step": 276, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.204, "grad_norm": 2.03125, "learning_rate": 2.2624260310949763e-05, "long_answer_loss": 0.204, "loss": 0.1764, "short_answer_loss": NaN, "step": 277, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.1896, "grad_norm": 1.7265625, "learning_rate": 2.2604999931802773e-05, "long_answer_loss": 0.1896, "loss": 0.1822, "short_answer_loss": NaN, "step": 278, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.191, "grad_norm": 1.859375, "learning_rate": 2.25856700626776e-05, "long_answer_loss": 0.191, "loss": 0.1832, "short_answer_loss": NaN, "step": 279, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.1685, "grad_norm": 1.90625, "learning_rate": 2.2566270836501725e-05, "long_answer_loss": 0.1685, "loss": 0.1778, "short_answer_loss": NaN, "step": 280, "template_loss": 0.0 }, { "epoch": 0.45, "full_loss": 0.171, "grad_norm": 1.6875, "learning_rate": 2.2546802386679585e-05, "long_answer_loss": 0.171, "loss": 0.1776, "short_answer_loss": NaN, "step": 281, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.2089, "grad_norm": 1.8125, "learning_rate": 2.2527264847091652e-05, "long_answer_loss": 0.2089, "loss": 0.1911, "short_answer_loss": NaN, "step": 282, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.1788, "grad_norm": 1.7578125, "learning_rate": 2.2507658352093503e-05, "long_answer_loss": 0.1788, "loss": 0.1738, "short_answer_loss": NaN, "step": 283, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.1825, "grad_norm": 1.921875, "learning_rate": 2.2487983036514932e-05, "long_answer_loss": 0.1825, "loss": 0.179, "short_answer_loss": NaN, "step": 284, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.1738, "grad_norm": 1.828125, "learning_rate": 2.2468239035658972e-05, "long_answer_loss": 0.1738, "loss": 0.1746, "short_answer_loss": NaN, "step": 285, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.1766, "grad_norm": 1.75, "learning_rate": 2.2448426485301006e-05, "long_answer_loss": 0.1766, "loss": 0.1774, "short_answer_loss": NaN, "step": 286, "template_loss": 0.0 }, { "epoch": 0.46, "full_loss": 0.1972, "grad_norm": 1.6171875, "learning_rate": 2.2428545521687816e-05, "long_answer_loss": 0.1972, "loss": 0.1653, "short_answer_loss": NaN, "step": 287, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.1686, "grad_norm": 1.8984375, "learning_rate": 2.2408596281536638e-05, "long_answer_loss": 0.1686, "loss": 0.1737, "short_answer_loss": NaN, "step": 288, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.1981, "grad_norm": 1.8984375, "learning_rate": 2.2388578902034243e-05, "long_answer_loss": 0.1981, "loss": 0.1735, "short_answer_loss": NaN, "step": 289, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.1923, "grad_norm": 1.9453125, "learning_rate": 2.2368493520835977e-05, "long_answer_loss": 0.1923, "loss": 0.1842, "short_answer_loss": NaN, "step": 290, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.2005, "grad_norm": 1.828125, "learning_rate": 2.2348340276064816e-05, "long_answer_loss": 0.2005, "loss": 0.1797, "short_answer_loss": NaN, "step": 291, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.179, "grad_norm": 1.9296875, "learning_rate": 2.2328119306310423e-05, "long_answer_loss": 0.179, "loss": 0.1784, "short_answer_loss": NaN, "step": 292, "template_loss": 0.0 }, { "epoch": 0.47, "full_loss": 0.1715, "grad_norm": 1.8046875, "learning_rate": 2.23078307506282e-05, "long_answer_loss": 0.1715, "loss": 0.1694, "short_answer_loss": NaN, "step": 293, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.1441, "grad_norm": 1.703125, "learning_rate": 2.2287474748538308e-05, "long_answer_loss": 0.1441, "loss": 0.1689, "short_answer_loss": NaN, "step": 294, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.1673, "grad_norm": 1.875, "learning_rate": 2.2267051440024734e-05, "long_answer_loss": 0.1673, "loss": 0.178, "short_answer_loss": NaN, "step": 295, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.2067, "grad_norm": 1.796875, "learning_rate": 2.2246560965534312e-05, "long_answer_loss": 0.2067, "loss": 0.1841, "short_answer_loss": NaN, "step": 296, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.1729, "grad_norm": 1.75, "learning_rate": 2.222600346597576e-05, "long_answer_loss": 0.1729, "loss": 0.1665, "short_answer_loss": NaN, "step": 297, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.1546, "grad_norm": 1.921875, "learning_rate": 2.2205379082718725e-05, "long_answer_loss": 0.1546, "loss": 0.1866, "short_answer_loss": NaN, "step": 298, "template_loss": 0.0 }, { "epoch": 0.48, "full_loss": 0.1767, "grad_norm": 1.765625, "learning_rate": 2.2184687957592786e-05, "long_answer_loss": 0.1767, "loss": 0.1773, "short_answer_loss": NaN, "step": 299, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1712, "grad_norm": 1.7578125, "learning_rate": 2.21639302328865e-05, "long_answer_loss": 0.1712, "loss": 0.1779, "short_answer_loss": NaN, "step": 300, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1846, "grad_norm": 1.9609375, "learning_rate": 2.2143106051346407e-05, "long_answer_loss": 0.1846, "loss": 0.1862, "short_answer_loss": NaN, "step": 301, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1772, "grad_norm": 1.8671875, "learning_rate": 2.2122215556176074e-05, "long_answer_loss": 0.1772, "loss": 0.1744, "short_answer_loss": NaN, "step": 302, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.172, "grad_norm": 1.765625, "learning_rate": 2.2101258891035075e-05, "long_answer_loss": 0.172, "loss": 0.1764, "short_answer_loss": NaN, "step": 303, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1873, "grad_norm": 1.9609375, "learning_rate": 2.2080236200038026e-05, "long_answer_loss": 0.1873, "loss": 0.1825, "short_answer_loss": NaN, "step": 304, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1944, "grad_norm": 1.96875, "learning_rate": 2.2059147627753595e-05, "long_answer_loss": 0.1944, "loss": 0.1785, "short_answer_loss": NaN, "step": 305, "template_loss": 0.0 }, { "epoch": 0.49, "full_loss": 0.1921, "grad_norm": 1.765625, "learning_rate": 2.2037993319203498e-05, "long_answer_loss": 0.1921, "loss": 0.1716, "short_answer_loss": NaN, "step": 306, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1942, "grad_norm": 1.9765625, "learning_rate": 2.20167734198615e-05, "long_answer_loss": 0.1942, "loss": 0.1826, "short_answer_loss": NaN, "step": 307, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1896, "grad_norm": 2.015625, "learning_rate": 2.1995488075652433e-05, "long_answer_loss": 0.1896, "loss": 0.1821, "short_answer_loss": NaN, "step": 308, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1871, "grad_norm": 1.9140625, "learning_rate": 2.1974137432951165e-05, "long_answer_loss": 0.1871, "loss": 0.1877, "short_answer_loss": NaN, "step": 309, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1826, "grad_norm": 1.734375, "learning_rate": 2.195272163858162e-05, "long_answer_loss": 0.1826, "loss": 0.1769, "short_answer_loss": NaN, "step": 310, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1859, "grad_norm": 2.03125, "learning_rate": 2.193124083981575e-05, "long_answer_loss": 0.1859, "loss": 0.1716, "short_answer_loss": NaN, "step": 311, "template_loss": 0.0 }, { "epoch": 0.5, "full_loss": 0.1858, "grad_norm": 1.765625, "learning_rate": 2.190969518437253e-05, "long_answer_loss": 0.1858, "loss": 0.1744, "short_answer_loss": NaN, "step": 312, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1558, "grad_norm": 1.7578125, "learning_rate": 2.1888084820416944e-05, "long_answer_loss": 0.1558, "loss": 0.1725, "short_answer_loss": NaN, "step": 313, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1731, "grad_norm": 1.8359375, "learning_rate": 2.186640989655896e-05, "long_answer_loss": 0.1731, "loss": 0.189, "short_answer_loss": NaN, "step": 314, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1705, "grad_norm": 1.7734375, "learning_rate": 2.1844670561852508e-05, "long_answer_loss": 0.1705, "loss": 0.1745, "short_answer_loss": NaN, "step": 315, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1566, "grad_norm": 2.046875, "learning_rate": 2.1822866965794465e-05, "long_answer_loss": 0.1566, "loss": 0.1757, "short_answer_loss": NaN, "step": 316, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1705, "grad_norm": 1.953125, "learning_rate": 2.180099925832361e-05, "long_answer_loss": 0.1705, "loss": 0.173, "short_answer_loss": NaN, "step": 317, "template_loss": 0.0 }, { "epoch": 0.51, "full_loss": 0.1721, "grad_norm": 1.8828125, "learning_rate": 2.177906758981962e-05, "long_answer_loss": 0.1721, "loss": 0.1692, "short_answer_loss": NaN, "step": 318, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.1687, "grad_norm": 2.015625, "learning_rate": 2.1757072111101994e-05, "long_answer_loss": 0.1687, "loss": 0.1773, "short_answer_loss": NaN, "step": 319, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.1731, "grad_norm": 1.9140625, "learning_rate": 2.1735012973429068e-05, "long_answer_loss": 0.1731, "loss": 0.1768, "short_answer_loss": NaN, "step": 320, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.1698, "grad_norm": 1.9375, "learning_rate": 2.1712890328496927e-05, "long_answer_loss": 0.1698, "loss": 0.1804, "short_answer_loss": NaN, "step": 321, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.1582, "grad_norm": 1.9375, "learning_rate": 2.1690704328438384e-05, "long_answer_loss": 0.1582, "loss": 0.1813, "short_answer_loss": NaN, "step": 322, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.1814, "grad_norm": 1.765625, "learning_rate": 2.1668455125821945e-05, "long_answer_loss": 0.1814, "loss": 0.1804, "short_answer_loss": NaN, "step": 323, "template_loss": 0.0 }, { "epoch": 0.52, "full_loss": 0.2042, "grad_norm": 1.9296875, "learning_rate": 2.1646142873650738e-05, "long_answer_loss": 0.2042, "loss": 0.1828, "short_answer_loss": NaN, "step": 324, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1851, "grad_norm": 1.828125, "learning_rate": 2.1623767725361466e-05, "long_answer_loss": 0.1851, "loss": 0.1888, "short_answer_loss": NaN, "step": 325, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1432, "grad_norm": 1.7734375, "learning_rate": 2.160132983482336e-05, "long_answer_loss": 0.1432, "loss": 0.1752, "short_answer_loss": NaN, "step": 326, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1712, "grad_norm": 1.8046875, "learning_rate": 2.157882935633712e-05, "long_answer_loss": 0.1712, "loss": 0.1735, "short_answer_loss": NaN, "step": 327, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1751, "grad_norm": 1.828125, "learning_rate": 2.1556266444633845e-05, "long_answer_loss": 0.1751, "loss": 0.1697, "short_answer_loss": NaN, "step": 328, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1779, "grad_norm": 1.7578125, "learning_rate": 2.153364125487397e-05, "long_answer_loss": 0.1779, "loss": 0.1869, "short_answer_loss": NaN, "step": 329, "template_loss": 0.0 }, { "epoch": 0.53, "full_loss": 0.1904, "grad_norm": 1.8046875, "learning_rate": 2.1510953942646215e-05, "long_answer_loss": 0.1904, "loss": 0.1821, "short_answer_loss": NaN, "step": 330, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1818, "grad_norm": 1.859375, "learning_rate": 2.1488204663966498e-05, "long_answer_loss": 0.1818, "loss": 0.1789, "short_answer_loss": NaN, "step": 331, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1572, "grad_norm": 1.6953125, "learning_rate": 2.1465393575276867e-05, "long_answer_loss": 0.1572, "loss": 0.1721, "short_answer_loss": NaN, "step": 332, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1797, "grad_norm": 1.640625, "learning_rate": 2.1442520833444416e-05, "long_answer_loss": 0.1797, "loss": 0.1697, "short_answer_loss": NaN, "step": 333, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1511, "grad_norm": 1.7421875, "learning_rate": 2.1419586595760226e-05, "long_answer_loss": 0.1511, "loss": 0.1778, "short_answer_loss": NaN, "step": 334, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1576, "grad_norm": 1.6875, "learning_rate": 2.1396591019938278e-05, "long_answer_loss": 0.1576, "loss": 0.1691, "short_answer_loss": NaN, "step": 335, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1842, "grad_norm": 1.5859375, "learning_rate": 2.1373534264114344e-05, "long_answer_loss": 0.1842, "loss": 0.1698, "short_answer_loss": NaN, "step": 336, "template_loss": 0.0 }, { "epoch": 0.54, "full_loss": 0.1586, "grad_norm": 1.734375, "learning_rate": 2.1350416486844928e-05, "long_answer_loss": 0.1586, "loss": 0.1704, "short_answer_loss": NaN, "step": 337, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.1903, "grad_norm": 1.7109375, "learning_rate": 2.1327237847106167e-05, "long_answer_loss": 0.1903, "loss": 0.1772, "short_answer_loss": NaN, "step": 338, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.161, "grad_norm": 1.6328125, "learning_rate": 2.130399850429274e-05, "long_answer_loss": 0.161, "loss": 0.1591, "short_answer_loss": NaN, "step": 339, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.1558, "grad_norm": 1.78125, "learning_rate": 2.1280698618216757e-05, "long_answer_loss": 0.1558, "loss": 0.1735, "short_answer_loss": NaN, "step": 340, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.1741, "grad_norm": 1.8203125, "learning_rate": 2.125733834910668e-05, "long_answer_loss": 0.1741, "loss": 0.169, "short_answer_loss": NaN, "step": 341, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.1632, "grad_norm": 1.71875, "learning_rate": 2.1233917857606212e-05, "long_answer_loss": 0.1632, "loss": 0.1762, "short_answer_loss": NaN, "step": 342, "template_loss": 0.0 }, { "epoch": 0.55, "full_loss": 0.1648, "grad_norm": 1.7734375, "learning_rate": 2.1210437304773185e-05, "long_answer_loss": 0.1648, "loss": 0.1732, "short_answer_loss": NaN, "step": 343, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.1699, "grad_norm": 1.765625, "learning_rate": 2.1186896852078476e-05, "long_answer_loss": 0.1699, "loss": 0.176, "short_answer_loss": NaN, "step": 344, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.1685, "grad_norm": 1.796875, "learning_rate": 2.1163296661404864e-05, "long_answer_loss": 0.1685, "loss": 0.1694, "short_answer_loss": NaN, "step": 345, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.1652, "grad_norm": 1.8125, "learning_rate": 2.113963689504594e-05, "long_answer_loss": 0.1652, "loss": 0.167, "short_answer_loss": NaN, "step": 346, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.1597, "grad_norm": 1.7734375, "learning_rate": 2.111591771570499e-05, "long_answer_loss": 0.1597, "loss": 0.1695, "short_answer_loss": NaN, "step": 347, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.2015, "grad_norm": 1.7734375, "learning_rate": 2.1092139286493866e-05, "long_answer_loss": 0.2015, "loss": 0.1803, "short_answer_loss": NaN, "step": 348, "template_loss": 0.0 }, { "epoch": 0.56, "full_loss": 0.1709, "grad_norm": 1.90625, "learning_rate": 2.106830177093187e-05, "long_answer_loss": 0.1709, "loss": 0.1764, "short_answer_loss": NaN, "step": 349, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.2019, "grad_norm": 1.703125, "learning_rate": 2.104440533294462e-05, "long_answer_loss": 0.2019, "loss": 0.1725, "short_answer_loss": NaN, "step": 350, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.1753, "grad_norm": 1.6875, "learning_rate": 2.1020450136862953e-05, "long_answer_loss": 0.1753, "loss": 0.173, "short_answer_loss": NaN, "step": 351, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.18, "grad_norm": 1.7265625, "learning_rate": 2.0996436347421744e-05, "long_answer_loss": 0.18, "loss": 0.1768, "short_answer_loss": NaN, "step": 352, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.1674, "grad_norm": 1.640625, "learning_rate": 2.0972364129758825e-05, "long_answer_loss": 0.1674, "loss": 0.1755, "short_answer_loss": NaN, "step": 353, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.1793, "grad_norm": 1.7734375, "learning_rate": 2.0948233649413815e-05, "long_answer_loss": 0.1793, "loss": 0.1821, "short_answer_loss": NaN, "step": 354, "template_loss": 0.0 }, { "epoch": 0.57, "full_loss": 0.1591, "grad_norm": 1.78125, "learning_rate": 2.0924045072327003e-05, "long_answer_loss": 0.1591, "loss": 0.1628, "short_answer_loss": NaN, "step": 355, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.1624, "grad_norm": 1.640625, "learning_rate": 2.089979856483819e-05, "long_answer_loss": 0.1624, "loss": 0.1692, "short_answer_loss": NaN, "step": 356, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.1785, "grad_norm": 1.765625, "learning_rate": 2.0875494293685548e-05, "long_answer_loss": 0.1785, "loss": 0.1717, "short_answer_loss": NaN, "step": 357, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.1436, "grad_norm": 1.9375, "learning_rate": 2.0851132426004492e-05, "long_answer_loss": 0.1436, "loss": 0.1697, "short_answer_loss": NaN, "step": 358, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.1671, "grad_norm": 1.8203125, "learning_rate": 2.082671312932651e-05, "long_answer_loss": 0.1671, "loss": 0.1679, "short_answer_loss": NaN, "step": 359, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.1741, "grad_norm": 1.8359375, "learning_rate": 2.0802236571578e-05, "long_answer_loss": 0.1741, "loss": 0.1639, "short_answer_loss": NaN, "step": 360, "template_loss": 0.0 }, { "epoch": 0.58, "full_loss": 0.179, "grad_norm": 1.8046875, "learning_rate": 2.0777702921079163e-05, "long_answer_loss": 0.179, "loss": 0.1703, "short_answer_loss": NaN, "step": 361, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1563, "grad_norm": 1.90625, "learning_rate": 2.075311234654279e-05, "long_answer_loss": 0.1563, "loss": 0.1645, "short_answer_loss": NaN, "step": 362, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1723, "grad_norm": 1.7890625, "learning_rate": 2.072846501707314e-05, "long_answer_loss": 0.1723, "loss": 0.1693, "short_answer_loss": NaN, "step": 363, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1928, "grad_norm": 1.6796875, "learning_rate": 2.0703761102164764e-05, "long_answer_loss": 0.1928, "loss": 0.164, "short_answer_loss": NaN, "step": 364, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1731, "grad_norm": 1.7890625, "learning_rate": 2.0679000771701326e-05, "long_answer_loss": 0.1731, "loss": 0.1676, "short_answer_loss": NaN, "step": 365, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1856, "grad_norm": 1.8515625, "learning_rate": 2.0654184195954465e-05, "long_answer_loss": 0.1856, "loss": 0.1661, "short_answer_loss": NaN, "step": 366, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.1614, "grad_norm": 1.6875, "learning_rate": 2.0629311545582598e-05, "long_answer_loss": 0.1614, "loss": 0.1805, "short_answer_loss": NaN, "step": 367, "template_loss": 0.0 }, { "epoch": 0.59, "full_loss": 0.172, "grad_norm": 1.71875, "learning_rate": 2.0604382991629755e-05, "long_answer_loss": 0.172, "loss": 0.1736, "short_answer_loss": NaN, "step": 368, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1353, "grad_norm": 1.875, "learning_rate": 2.057939870552441e-05, "long_answer_loss": 0.1353, "loss": 0.1733, "short_answer_loss": NaN, "step": 369, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1488, "grad_norm": 1.75, "learning_rate": 2.0554358859078284e-05, "long_answer_loss": 0.1488, "loss": 0.1699, "short_answer_loss": NaN, "step": 370, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1754, "grad_norm": 1.6484375, "learning_rate": 2.0529263624485183e-05, "long_answer_loss": 0.1754, "loss": 0.1711, "short_answer_loss": NaN, "step": 371, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1752, "grad_norm": 1.8984375, "learning_rate": 2.0504113174319812e-05, "long_answer_loss": 0.1752, "loss": 0.1788, "short_answer_loss": NaN, "step": 372, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1742, "grad_norm": 1.75, "learning_rate": 2.0478907681536564e-05, "long_answer_loss": 0.1742, "loss": 0.1839, "short_answer_loss": NaN, "step": 373, "template_loss": 0.0 }, { "epoch": 0.6, "full_loss": 0.1508, "grad_norm": 1.71875, "learning_rate": 2.0453647319468368e-05, "long_answer_loss": 0.1508, "loss": 0.1696, "short_answer_loss": NaN, "step": 374, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.1675, "grad_norm": 1.7265625, "learning_rate": 2.0428332261825456e-05, "long_answer_loss": 0.1675, "loss": 0.1737, "short_answer_loss": NaN, "step": 375, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.1773, "grad_norm": 1.8984375, "learning_rate": 2.0402962682694214e-05, "long_answer_loss": 0.1773, "loss": 0.1843, "short_answer_loss": NaN, "step": 376, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.19, "grad_norm": 1.9375, "learning_rate": 2.0377538756535947e-05, "long_answer_loss": 0.19, "loss": 0.1772, "short_answer_loss": NaN, "step": 377, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.1942, "grad_norm": 1.9140625, "learning_rate": 2.0352060658185696e-05, "long_answer_loss": 0.1942, "loss": 0.1824, "short_answer_loss": NaN, "step": 378, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.1905, "grad_norm": 1.9140625, "learning_rate": 2.0326528562851028e-05, "long_answer_loss": 0.1905, "loss": 0.1717, "short_answer_loss": NaN, "step": 379, "template_loss": 0.0 }, { "epoch": 0.61, "full_loss": 0.1579, "grad_norm": 1.84375, "learning_rate": 2.030094264611084e-05, "long_answer_loss": 0.1579, "loss": 0.1621, "short_answer_loss": NaN, "step": 380, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.1883, "grad_norm": 1.796875, "learning_rate": 2.027530308391416e-05, "long_answer_loss": 0.1883, "loss": 0.1653, "short_answer_loss": NaN, "step": 381, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.1601, "grad_norm": 1.7890625, "learning_rate": 2.02496100525789e-05, "long_answer_loss": 0.1601, "loss": 0.1722, "short_answer_loss": NaN, "step": 382, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.1761, "grad_norm": 1.828125, "learning_rate": 2.0223863728790682e-05, "long_answer_loss": 0.1761, "loss": 0.1696, "short_answer_loss": NaN, "step": 383, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.192, "grad_norm": 1.78125, "learning_rate": 2.0198064289601615e-05, "long_answer_loss": 0.192, "loss": 0.166, "short_answer_loss": NaN, "step": 384, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.2026, "grad_norm": 1.890625, "learning_rate": 2.017221191242906e-05, "long_answer_loss": 0.2026, "loss": 0.1795, "short_answer_loss": NaN, "step": 385, "template_loss": 0.0 }, { "epoch": 0.62, "full_loss": 0.1869, "grad_norm": 1.8359375, "learning_rate": 2.014630677505443e-05, "long_answer_loss": 0.1869, "loss": 0.1652, "short_answer_loss": NaN, "step": 386, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1679, "grad_norm": 1.625, "learning_rate": 2.0120349055621952e-05, "long_answer_loss": 0.1679, "loss": 0.173, "short_answer_loss": NaN, "step": 387, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1758, "grad_norm": 1.671875, "learning_rate": 2.0094338932637447e-05, "long_answer_loss": 0.1758, "loss": 0.1693, "short_answer_loss": NaN, "step": 388, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1879, "grad_norm": 1.671875, "learning_rate": 2.0068276584967113e-05, "long_answer_loss": 0.1879, "loss": 0.1621, "short_answer_loss": NaN, "step": 389, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1635, "grad_norm": 1.765625, "learning_rate": 2.0042162191836285e-05, "long_answer_loss": 0.1635, "loss": 0.1748, "short_answer_loss": NaN, "step": 390, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1437, "grad_norm": 1.7890625, "learning_rate": 2.00159959328282e-05, "long_answer_loss": 0.1437, "loss": 0.1693, "short_answer_loss": NaN, "step": 391, "template_loss": 0.0 }, { "epoch": 0.63, "full_loss": 0.1727, "grad_norm": 1.71875, "learning_rate": 1.9989777987882763e-05, "long_answer_loss": 0.1727, "loss": 0.1669, "short_answer_loss": NaN, "step": 392, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.1738, "grad_norm": 1.75, "learning_rate": 1.996350853729532e-05, "long_answer_loss": 0.1738, "loss": 0.1747, "short_answer_loss": NaN, "step": 393, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.1698, "grad_norm": 1.796875, "learning_rate": 1.993718776171541e-05, "long_answer_loss": 0.1698, "loss": 0.1653, "short_answer_loss": NaN, "step": 394, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.1624, "grad_norm": 1.796875, "learning_rate": 1.9910815842145513e-05, "long_answer_loss": 0.1624, "loss": 0.1777, "short_answer_loss": NaN, "step": 395, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.1493, "grad_norm": 1.8203125, "learning_rate": 1.9884392959939824e-05, "long_answer_loss": 0.1493, "loss": 0.1654, "short_answer_loss": NaN, "step": 396, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.166, "grad_norm": 1.7109375, "learning_rate": 1.9857919296803e-05, "long_answer_loss": 0.166, "loss": 0.1705, "short_answer_loss": NaN, "step": 397, "template_loss": 0.0 }, { "epoch": 0.64, "full_loss": 0.1814, "grad_norm": 1.890625, "learning_rate": 1.9831395034788904e-05, "long_answer_loss": 0.1814, "loss": 0.1733, "short_answer_loss": NaN, "step": 398, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.161, "grad_norm": 1.90625, "learning_rate": 1.9804820356299356e-05, "long_answer_loss": 0.161, "loss": 0.1688, "short_answer_loss": NaN, "step": 399, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1599, "grad_norm": 1.703125, "learning_rate": 1.9778195444082877e-05, "long_answer_loss": 0.1599, "loss": 0.1657, "short_answer_loss": NaN, "step": 400, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1777, "grad_norm": 1.8828125, "learning_rate": 1.9751520481233445e-05, "long_answer_loss": 0.1777, "loss": 0.1851, "short_answer_loss": NaN, "step": 401, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1591, "grad_norm": 1.6796875, "learning_rate": 1.9724795651189214e-05, "long_answer_loss": 0.1591, "loss": 0.1582, "short_answer_loss": NaN, "step": 402, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1798, "grad_norm": 1.6875, "learning_rate": 1.969802113773127e-05, "long_answer_loss": 0.1798, "loss": 0.1707, "short_answer_loss": NaN, "step": 403, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1611, "grad_norm": 1.75, "learning_rate": 1.967119712498236e-05, "long_answer_loss": 0.1611, "loss": 0.1632, "short_answer_loss": NaN, "step": 404, "template_loss": 0.0 }, { "epoch": 0.65, "full_loss": 0.1505, "grad_norm": 1.6953125, "learning_rate": 1.9644323797405633e-05, "long_answer_loss": 0.1505, "loss": 0.1616, "short_answer_loss": NaN, "step": 405, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.2129, "grad_norm": 1.625, "learning_rate": 1.961740133980336e-05, "long_answer_loss": 0.2129, "loss": 0.1712, "short_answer_loss": NaN, "step": 406, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.1631, "grad_norm": 1.65625, "learning_rate": 1.959042993731567e-05, "long_answer_loss": 0.1631, "loss": 0.1586, "short_answer_loss": NaN, "step": 407, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.1718, "grad_norm": 1.671875, "learning_rate": 1.956340977541927e-05, "long_answer_loss": 0.1718, "loss": 0.1687, "short_answer_loss": NaN, "step": 408, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.1629, "grad_norm": 1.6875, "learning_rate": 1.9536341039926186e-05, "long_answer_loss": 0.1629, "loss": 0.1702, "short_answer_loss": NaN, "step": 409, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.1599, "grad_norm": 1.6953125, "learning_rate": 1.9509223916982472e-05, "long_answer_loss": 0.1599, "loss": 0.1605, "short_answer_loss": NaN, "step": 410, "template_loss": 0.0 }, { "epoch": 0.66, "full_loss": 0.1586, "grad_norm": 1.703125, "learning_rate": 1.9482058593066923e-05, "long_answer_loss": 0.1586, "loss": 0.171, "short_answer_loss": NaN, "step": 411, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.1573, "grad_norm": 1.7421875, "learning_rate": 1.9454845254989818e-05, "long_answer_loss": 0.1573, "loss": 0.1675, "short_answer_loss": NaN, "step": 412, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.177, "grad_norm": 1.8125, "learning_rate": 1.9427584089891598e-05, "long_answer_loss": 0.177, "loss": 0.1752, "short_answer_loss": NaN, "step": 413, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.1527, "grad_norm": 1.5625, "learning_rate": 1.9400275285241624e-05, "long_answer_loss": 0.1527, "loss": 0.1598, "short_answer_loss": NaN, "step": 414, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.1577, "grad_norm": 1.7890625, "learning_rate": 1.9372919028836855e-05, "long_answer_loss": 0.1577, "loss": 0.1734, "short_answer_loss": NaN, "step": 415, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.183, "grad_norm": 1.703125, "learning_rate": 1.9345515508800556e-05, "long_answer_loss": 0.183, "loss": 0.1676, "short_answer_loss": NaN, "step": 416, "template_loss": 0.0 }, { "epoch": 0.67, "full_loss": 0.1698, "grad_norm": 1.7578125, "learning_rate": 1.931806491358102e-05, "long_answer_loss": 0.1698, "loss": 0.1672, "short_answer_loss": NaN, "step": 417, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1583, "grad_norm": 1.8828125, "learning_rate": 1.929056743195028e-05, "long_answer_loss": 0.1583, "loss": 0.172, "short_answer_loss": NaN, "step": 418, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1488, "grad_norm": 1.8125, "learning_rate": 1.9263023253002773e-05, "long_answer_loss": 0.1488, "loss": 0.1674, "short_answer_loss": NaN, "step": 419, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1781, "grad_norm": 1.7734375, "learning_rate": 1.9235432566154084e-05, "long_answer_loss": 0.1781, "loss": 0.1593, "short_answer_loss": NaN, "step": 420, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1786, "grad_norm": 1.6953125, "learning_rate": 1.9207795561139614e-05, "long_answer_loss": 0.1786, "loss": 0.1674, "short_answer_loss": NaN, "step": 421, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1826, "grad_norm": 1.734375, "learning_rate": 1.9180112428013286e-05, "long_answer_loss": 0.1826, "loss": 0.1615, "short_answer_loss": NaN, "step": 422, "template_loss": 0.0 }, { "epoch": 0.68, "full_loss": 0.1832, "grad_norm": 1.765625, "learning_rate": 1.915238335714623e-05, "long_answer_loss": 0.1832, "loss": 0.1728, "short_answer_loss": NaN, "step": 423, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.1843, "grad_norm": 1.8046875, "learning_rate": 1.9124608539225496e-05, "long_answer_loss": 0.1843, "loss": 0.1677, "short_answer_loss": NaN, "step": 424, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.169, "grad_norm": 1.71875, "learning_rate": 1.909678816525271e-05, "long_answer_loss": 0.169, "loss": 0.1727, "short_answer_loss": NaN, "step": 425, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.1327, "grad_norm": 1.625, "learning_rate": 1.9068922426542783e-05, "long_answer_loss": 0.1327, "loss": 0.1565, "short_answer_loss": NaN, "step": 426, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.1685, "grad_norm": 1.71875, "learning_rate": 1.9041011514722602e-05, "long_answer_loss": 0.1685, "loss": 0.1745, "short_answer_loss": NaN, "step": 427, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.1876, "grad_norm": 1.7578125, "learning_rate": 1.901305562172968e-05, "long_answer_loss": 0.1876, "loss": 0.1722, "short_answer_loss": NaN, "step": 428, "template_loss": 0.0 }, { "epoch": 0.69, "full_loss": 0.1595, "grad_norm": 1.84375, "learning_rate": 1.898505493981087e-05, "long_answer_loss": 0.1595, "loss": 0.1667, "short_answer_loss": NaN, "step": 429, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1656, "grad_norm": 1.609375, "learning_rate": 1.895700966152103e-05, "long_answer_loss": 0.1656, "loss": 0.1568, "short_answer_loss": NaN, "step": 430, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1723, "grad_norm": 1.7265625, "learning_rate": 1.8928919979721678e-05, "long_answer_loss": 0.1723, "loss": 0.1661, "short_answer_loss": NaN, "step": 431, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1612, "grad_norm": 1.703125, "learning_rate": 1.8900786087579712e-05, "long_answer_loss": 0.1612, "loss": 0.1571, "short_answer_loss": NaN, "step": 432, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1496, "grad_norm": 1.765625, "learning_rate": 1.8872608178566043e-05, "long_answer_loss": 0.1496, "loss": 0.1661, "short_answer_loss": NaN, "step": 433, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1647, "grad_norm": 1.8125, "learning_rate": 1.8844386446454275e-05, "long_answer_loss": 0.1647, "loss": 0.1686, "short_answer_loss": NaN, "step": 434, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1544, "grad_norm": 1.890625, "learning_rate": 1.881612108531938e-05, "long_answer_loss": 0.1544, "loss": 0.1683, "short_answer_loss": NaN, "step": 435, "template_loss": 0.0 }, { "epoch": 0.7, "full_loss": 0.1494, "grad_norm": 1.6875, "learning_rate": 1.878781228953635e-05, "long_answer_loss": 0.1494, "loss": 0.1597, "short_answer_loss": NaN, "step": 436, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.171, "grad_norm": 1.71875, "learning_rate": 1.8759460253778877e-05, "long_answer_loss": 0.171, "loss": 0.1691, "short_answer_loss": NaN, "step": 437, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.1657, "grad_norm": 1.5625, "learning_rate": 1.8731065173018e-05, "long_answer_loss": 0.1657, "loss": 0.1576, "short_answer_loss": NaN, "step": 438, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.17, "grad_norm": 1.8515625, "learning_rate": 1.870262724252077e-05, "long_answer_loss": 0.17, "loss": 0.1781, "short_answer_loss": NaN, "step": 439, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.1849, "grad_norm": 1.8359375, "learning_rate": 1.8674146657848908e-05, "long_answer_loss": 0.1849, "loss": 0.1736, "short_answer_loss": NaN, "step": 440, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.1792, "grad_norm": 1.6875, "learning_rate": 1.8645623614857455e-05, "long_answer_loss": 0.1792, "loss": 0.1701, "short_answer_loss": NaN, "step": 441, "template_loss": 0.0 }, { "epoch": 0.71, "full_loss": 0.2047, "grad_norm": 1.7890625, "learning_rate": 1.8617058309693437e-05, "long_answer_loss": 0.2047, "loss": 0.1788, "short_answer_loss": NaN, "step": 442, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.163, "grad_norm": 1.71875, "learning_rate": 1.85884509387945e-05, "long_answer_loss": 0.163, "loss": 0.1705, "short_answer_loss": NaN, "step": 443, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.1636, "grad_norm": 1.7421875, "learning_rate": 1.855980169888757e-05, "long_answer_loss": 0.1636, "loss": 0.1819, "short_answer_loss": NaN, "step": 444, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.1581, "grad_norm": 1.6953125, "learning_rate": 1.85311107869875e-05, "long_answer_loss": 0.1581, "loss": 0.1666, "short_answer_loss": NaN, "step": 445, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.1779, "grad_norm": 1.8125, "learning_rate": 1.850237840039571e-05, "long_answer_loss": 0.1779, "loss": 0.1726, "short_answer_loss": NaN, "step": 446, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.1499, "grad_norm": 1.7109375, "learning_rate": 1.8473604736698835e-05, "long_answer_loss": 0.1499, "loss": 0.1626, "short_answer_loss": NaN, "step": 447, "template_loss": 0.0 }, { "epoch": 0.72, "full_loss": 0.171, "grad_norm": 1.65625, "learning_rate": 1.844478999376736e-05, "long_answer_loss": 0.171, "loss": 0.1553, "short_answer_loss": NaN, "step": 448, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1674, "grad_norm": 1.6953125, "learning_rate": 1.841593436975427e-05, "long_answer_loss": 0.1674, "loss": 0.1701, "short_answer_loss": NaN, "step": 449, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1619, "grad_norm": 1.7265625, "learning_rate": 1.838703806309367e-05, "long_answer_loss": 0.1619, "loss": 0.163, "short_answer_loss": NaN, "step": 450, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1672, "grad_norm": 1.734375, "learning_rate": 1.8358101272499443e-05, "long_answer_loss": 0.1672, "loss": 0.1597, "short_answer_loss": NaN, "step": 451, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1467, "grad_norm": 1.7109375, "learning_rate": 1.8329124196963864e-05, "long_answer_loss": 0.1467, "loss": 0.1748, "short_answer_loss": NaN, "step": 452, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1644, "grad_norm": 1.734375, "learning_rate": 1.830010703575624e-05, "long_answer_loss": 0.1644, "loss": 0.1643, "short_answer_loss": NaN, "step": 453, "template_loss": 0.0 }, { "epoch": 0.73, "full_loss": 0.1672, "grad_norm": 1.6640625, "learning_rate": 1.827104998842154e-05, "long_answer_loss": 0.1672, "loss": 0.1646, "short_answer_loss": NaN, "step": 454, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1646, "grad_norm": 1.8359375, "learning_rate": 1.8241953254779027e-05, "long_answer_loss": 0.1646, "loss": 0.1643, "short_answer_loss": NaN, "step": 455, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1869, "grad_norm": 1.78125, "learning_rate": 1.8212817034920864e-05, "long_answer_loss": 0.1869, "loss": 0.1684, "short_answer_loss": NaN, "step": 456, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1618, "grad_norm": 1.8046875, "learning_rate": 1.818364152921077e-05, "long_answer_loss": 0.1618, "loss": 0.1577, "short_answer_loss": NaN, "step": 457, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1658, "grad_norm": 1.8046875, "learning_rate": 1.8154426938282615e-05, "long_answer_loss": 0.1658, "loss": 0.1688, "short_answer_loss": NaN, "step": 458, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1681, "grad_norm": 1.6640625, "learning_rate": 1.8125173463039048e-05, "long_answer_loss": 0.1681, "loss": 0.1663, "short_answer_loss": NaN, "step": 459, "template_loss": 0.0 }, { "epoch": 0.74, "full_loss": 0.1868, "grad_norm": 1.734375, "learning_rate": 1.8095881304650123e-05, "long_answer_loss": 0.1868, "loss": 0.1678, "short_answer_loss": NaN, "step": 460, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1519, "grad_norm": 1.703125, "learning_rate": 1.8066550664551904e-05, "long_answer_loss": 0.1519, "loss": 0.1604, "short_answer_loss": NaN, "step": 461, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1849, "grad_norm": 1.734375, "learning_rate": 1.8037181744445093e-05, "long_answer_loss": 0.1849, "loss": 0.1701, "short_answer_loss": NaN, "step": 462, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1386, "grad_norm": 1.625, "learning_rate": 1.8007774746293628e-05, "long_answer_loss": 0.1386, "loss": 0.1604, "short_answer_loss": NaN, "step": 463, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1884, "grad_norm": 1.7734375, "learning_rate": 1.7978329872323308e-05, "long_answer_loss": 0.1884, "loss": 0.16, "short_answer_loss": NaN, "step": 464, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1752, "grad_norm": 1.8671875, "learning_rate": 1.7948847325020394e-05, "long_answer_loss": 0.1752, "loss": 0.1646, "short_answer_loss": NaN, "step": 465, "template_loss": 0.0 }, { "epoch": 0.75, "full_loss": 0.1523, "grad_norm": 1.7109375, "learning_rate": 1.7919327307130217e-05, "long_answer_loss": 0.1523, "loss": 0.1599, "short_answer_loss": NaN, "step": 466, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.1735, "grad_norm": 1.8125, "learning_rate": 1.7889770021655787e-05, "long_answer_loss": 0.1735, "loss": 0.1826, "short_answer_loss": NaN, "step": 467, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.16, "grad_norm": 1.828125, "learning_rate": 1.78601756718564e-05, "long_answer_loss": 0.16, "loss": 0.1615, "short_answer_loss": NaN, "step": 468, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.1622, "grad_norm": 1.6953125, "learning_rate": 1.783054446124622e-05, "long_answer_loss": 0.1622, "loss": 0.1562, "short_answer_loss": NaN, "step": 469, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.1735, "grad_norm": 1.796875, "learning_rate": 1.7800876593592912e-05, "long_answer_loss": 0.1735, "loss": 0.1663, "short_answer_loss": NaN, "step": 470, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.148, "grad_norm": 1.7109375, "learning_rate": 1.777117227291622e-05, "long_answer_loss": 0.148, "loss": 0.167, "short_answer_loss": NaN, "step": 471, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.1756, "grad_norm": 1.7578125, "learning_rate": 1.7741431703486562e-05, "long_answer_loss": 0.1756, "loss": 0.1616, "short_answer_loss": NaN, "step": 472, "template_loss": 0.0 }, { "epoch": 0.76, "full_loss": 0.1404, "grad_norm": 1.75, "learning_rate": 1.7711655089823638e-05, "long_answer_loss": 0.1404, "loss": 0.1633, "short_answer_loss": NaN, "step": 473, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1554, "grad_norm": 1.859375, "learning_rate": 1.7681842636695007e-05, "long_answer_loss": 0.1554, "loss": 0.1644, "short_answer_loss": NaN, "step": 474, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1773, "grad_norm": 1.859375, "learning_rate": 1.7651994549114702e-05, "long_answer_loss": 0.1773, "loss": 0.1651, "short_answer_loss": NaN, "step": 475, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1536, "grad_norm": 1.5390625, "learning_rate": 1.7622111032341797e-05, "long_answer_loss": 0.1536, "loss": 0.1628, "short_answer_loss": NaN, "step": 476, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1713, "grad_norm": 1.8359375, "learning_rate": 1.7592192291879008e-05, "long_answer_loss": 0.1713, "loss": 0.1678, "short_answer_loss": NaN, "step": 477, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1917, "grad_norm": 1.65625, "learning_rate": 1.756223853347127e-05, "long_answer_loss": 0.1917, "loss": 0.1679, "short_answer_loss": NaN, "step": 478, "template_loss": 0.0 }, { "epoch": 0.77, "full_loss": 0.1724, "grad_norm": 1.6875, "learning_rate": 1.7532249963104344e-05, "long_answer_loss": 0.1724, "loss": 0.167, "short_answer_loss": NaN, "step": 479, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1845, "grad_norm": 1.765625, "learning_rate": 1.7502226787003378e-05, "long_answer_loss": 0.1845, "loss": 0.1716, "short_answer_loss": NaN, "step": 480, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1741, "grad_norm": 1.78125, "learning_rate": 1.747216921163149e-05, "long_answer_loss": 0.1741, "loss": 0.1599, "short_answer_loss": NaN, "step": 481, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1524, "grad_norm": 1.7265625, "learning_rate": 1.7442077443688364e-05, "long_answer_loss": 0.1524, "loss": 0.1597, "short_answer_loss": NaN, "step": 482, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1621, "grad_norm": 1.5859375, "learning_rate": 1.741195169010882e-05, "long_answer_loss": 0.1621, "loss": 0.1614, "short_answer_loss": NaN, "step": 483, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1449, "grad_norm": 1.7421875, "learning_rate": 1.7381792158061378e-05, "long_answer_loss": 0.1449, "loss": 0.1676, "short_answer_loss": NaN, "step": 484, "template_loss": 0.0 }, { "epoch": 0.78, "full_loss": 0.1858, "grad_norm": 1.671875, "learning_rate": 1.7351599054946853e-05, "long_answer_loss": 0.1858, "loss": 0.1664, "short_answer_loss": NaN, "step": 485, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.1459, "grad_norm": 1.625, "learning_rate": 1.732137258839693e-05, "long_answer_loss": 0.1459, "loss": 0.1635, "short_answer_loss": NaN, "step": 486, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.1591, "grad_norm": 1.6484375, "learning_rate": 1.7291112966272707e-05, "long_answer_loss": 0.1591, "loss": 0.1481, "short_answer_loss": NaN, "step": 487, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.1824, "grad_norm": 1.6953125, "learning_rate": 1.7260820396663307e-05, "long_answer_loss": 0.1824, "loss": 0.1691, "short_answer_loss": NaN, "step": 488, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.2103, "grad_norm": 1.7265625, "learning_rate": 1.723049508788442e-05, "long_answer_loss": 0.2103, "loss": 0.1607, "short_answer_loss": NaN, "step": 489, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.1865, "grad_norm": 1.6796875, "learning_rate": 1.720013724847686e-05, "long_answer_loss": 0.1865, "loss": 0.1544, "short_answer_loss": NaN, "step": 490, "template_loss": 0.0 }, { "epoch": 0.79, "full_loss": 0.1951, "grad_norm": 1.78125, "learning_rate": 1.716974708720517e-05, "long_answer_loss": 0.1951, "loss": 0.1694, "short_answer_loss": NaN, "step": 491, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1457, "grad_norm": 1.734375, "learning_rate": 1.7139324813056155e-05, "long_answer_loss": 0.1457, "loss": 0.1615, "short_answer_loss": NaN, "step": 492, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1946, "grad_norm": 1.6015625, "learning_rate": 1.7108870635237444e-05, "long_answer_loss": 0.1946, "loss": 0.1607, "short_answer_loss": NaN, "step": 493, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1513, "grad_norm": 1.7578125, "learning_rate": 1.7078384763176084e-05, "long_answer_loss": 0.1513, "loss": 0.1649, "short_answer_loss": NaN, "step": 494, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1478, "grad_norm": 1.7265625, "learning_rate": 1.7047867406517047e-05, "long_answer_loss": 0.1478, "loss": 0.1599, "short_answer_loss": NaN, "step": 495, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1656, "grad_norm": 1.7890625, "learning_rate": 1.7017318775121845e-05, "long_answer_loss": 0.1656, "loss": 0.1581, "short_answer_loss": NaN, "step": 496, "template_loss": 0.0 }, { "epoch": 0.8, "full_loss": 0.1545, "grad_norm": 1.6484375, "learning_rate": 1.6986739079067047e-05, "long_answer_loss": 0.1545, "loss": 0.1649, "short_answer_loss": NaN, "step": 497, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.155, "grad_norm": 1.7421875, "learning_rate": 1.6956128528642842e-05, "long_answer_loss": 0.155, "loss": 0.1563, "short_answer_loss": NaN, "step": 498, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.1534, "grad_norm": 1.703125, "learning_rate": 1.6925487334351613e-05, "long_answer_loss": 0.1534, "loss": 0.1636, "short_answer_loss": NaN, "step": 499, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.169, "grad_norm": 1.7890625, "learning_rate": 1.6894815706906458e-05, "long_answer_loss": 0.169, "loss": 0.1567, "short_answer_loss": NaN, "step": 500, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.168, "grad_norm": 1.7734375, "learning_rate": 1.686411385722977e-05, "long_answer_loss": 0.168, "loss": 0.16, "short_answer_loss": NaN, "step": 501, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.1441, "grad_norm": 1.7890625, "learning_rate": 1.683338199645177e-05, "long_answer_loss": 0.1441, "loss": 0.1669, "short_answer_loss": NaN, "step": 502, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.1572, "grad_norm": 1.7265625, "learning_rate": 1.6802620335909054e-05, "long_answer_loss": 0.1572, "loss": 0.1643, "short_answer_loss": NaN, "step": 503, "template_loss": 0.0 }, { "epoch": 0.81, "full_loss": 0.1503, "grad_norm": 1.7578125, "learning_rate": 1.6771829087143156e-05, "long_answer_loss": 0.1503, "loss": 0.1601, "short_answer_loss": NaN, "step": 504, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.1735, "grad_norm": 1.6875, "learning_rate": 1.6741008461899073e-05, "long_answer_loss": 0.1735, "loss": 0.1561, "short_answer_loss": NaN, "step": 505, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.183, "grad_norm": 1.7109375, "learning_rate": 1.6710158672123818e-05, "long_answer_loss": 0.183, "loss": 0.1685, "short_answer_loss": NaN, "step": 506, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.16, "grad_norm": 1.7734375, "learning_rate": 1.6679279929964968e-05, "long_answer_loss": 0.16, "loss": 0.1582, "short_answer_loss": NaN, "step": 507, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.1683, "grad_norm": 1.7578125, "learning_rate": 1.6648372447769197e-05, "long_answer_loss": 0.1683, "loss": 0.167, "short_answer_loss": NaN, "step": 508, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.1724, "grad_norm": 1.6640625, "learning_rate": 1.6617436438080812e-05, "long_answer_loss": 0.1724, "loss": 0.16, "short_answer_loss": NaN, "step": 509, "template_loss": 0.0 }, { "epoch": 0.82, "full_loss": 0.1824, "grad_norm": 1.8203125, "learning_rate": 1.6586472113640306e-05, "long_answer_loss": 0.1824, "loss": 0.1644, "short_answer_loss": NaN, "step": 510, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1504, "grad_norm": 1.828125, "learning_rate": 1.6555479687382887e-05, "long_answer_loss": 0.1504, "loss": 0.1576, "short_answer_loss": NaN, "step": 511, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1488, "grad_norm": 1.7421875, "learning_rate": 1.6524459372437004e-05, "long_answer_loss": 0.1488, "loss": 0.1612, "short_answer_loss": NaN, "step": 512, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1622, "grad_norm": 1.6953125, "learning_rate": 1.64934113821229e-05, "long_answer_loss": 0.1622, "loss": 0.1553, "short_answer_loss": NaN, "step": 513, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1501, "grad_norm": 1.7890625, "learning_rate": 1.6462335929951133e-05, "long_answer_loss": 0.1501, "loss": 0.1545, "short_answer_loss": NaN, "step": 514, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1467, "grad_norm": 1.7421875, "learning_rate": 1.643123322962111e-05, "long_answer_loss": 0.1467, "loss": 0.154, "short_answer_loss": NaN, "step": 515, "template_loss": 0.0 }, { "epoch": 0.83, "full_loss": 0.1546, "grad_norm": 1.6953125, "learning_rate": 1.6400103495019618e-05, "long_answer_loss": 0.1546, "loss": 0.1612, "short_answer_loss": NaN, "step": 516, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.1756, "grad_norm": 1.6953125, "learning_rate": 1.6368946940219352e-05, "long_answer_loss": 0.1756, "loss": 0.1622, "short_answer_loss": NaN, "step": 517, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.1914, "grad_norm": 1.75, "learning_rate": 1.633776377947745e-05, "long_answer_loss": 0.1914, "loss": 0.1644, "short_answer_loss": NaN, "step": 518, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.1482, "grad_norm": 1.671875, "learning_rate": 1.6306554227233994e-05, "long_answer_loss": 0.1482, "loss": 0.1614, "short_answer_loss": NaN, "step": 519, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.147, "grad_norm": 1.7890625, "learning_rate": 1.6275318498110585e-05, "long_answer_loss": 0.147, "loss": 0.1617, "short_answer_loss": NaN, "step": 520, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.1683, "grad_norm": 1.8125, "learning_rate": 1.6244056806908816e-05, "long_answer_loss": 0.1683, "loss": 0.1655, "short_answer_loss": NaN, "step": 521, "template_loss": 0.0 }, { "epoch": 0.84, "full_loss": 0.1623, "grad_norm": 1.6875, "learning_rate": 1.621276936860882e-05, "long_answer_loss": 0.1623, "loss": 0.1619, "short_answer_loss": NaN, "step": 522, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.1877, "grad_norm": 1.6953125, "learning_rate": 1.6181456398367788e-05, "long_answer_loss": 0.1877, "loss": 0.1621, "short_answer_loss": NaN, "step": 523, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.1704, "grad_norm": 1.828125, "learning_rate": 1.6150118111518493e-05, "long_answer_loss": 0.1704, "loss": 0.1613, "short_answer_loss": NaN, "step": 524, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.1437, "grad_norm": 1.6640625, "learning_rate": 1.6118754723567798e-05, "long_answer_loss": 0.1437, "loss": 0.1453, "short_answer_loss": NaN, "step": 525, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.1142, "grad_norm": 1.640625, "learning_rate": 1.608736645019518e-05, "long_answer_loss": 0.1142, "loss": 0.1564, "short_answer_loss": NaN, "step": 526, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.1551, "grad_norm": 1.6328125, "learning_rate": 1.605595350725126e-05, "long_answer_loss": 0.1551, "loss": 0.1566, "short_answer_loss": NaN, "step": 527, "template_loss": 0.0 }, { "epoch": 0.85, "full_loss": 0.2099, "grad_norm": 1.765625, "learning_rate": 1.6024516110756296e-05, "long_answer_loss": 0.2099, "loss": 0.1686, "short_answer_loss": NaN, "step": 528, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1506, "grad_norm": 1.7421875, "learning_rate": 1.5993054476898708e-05, "long_answer_loss": 0.1506, "loss": 0.1566, "short_answer_loss": NaN, "step": 529, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1487, "grad_norm": 1.640625, "learning_rate": 1.59615688220336e-05, "long_answer_loss": 0.1487, "loss": 0.1565, "short_answer_loss": NaN, "step": 530, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1496, "grad_norm": 1.71875, "learning_rate": 1.593005936268125e-05, "long_answer_loss": 0.1496, "loss": 0.1567, "short_answer_loss": NaN, "step": 531, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1571, "grad_norm": 1.78125, "learning_rate": 1.5898526315525646e-05, "long_answer_loss": 0.1571, "loss": 0.1597, "short_answer_loss": NaN, "step": 532, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1515, "grad_norm": 1.8515625, "learning_rate": 1.5866969897412984e-05, "long_answer_loss": 0.1515, "loss": 0.1731, "short_answer_loss": NaN, "step": 533, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.1773, "grad_norm": 1.7421875, "learning_rate": 1.583539032535017e-05, "long_answer_loss": 0.1773, "loss": 0.1649, "short_answer_loss": NaN, "step": 534, "template_loss": 0.0 }, { "epoch": 0.86, "full_loss": 0.16, "grad_norm": 1.8125, "learning_rate": 1.5803787816503336e-05, "long_answer_loss": 0.16, "loss": 0.1577, "short_answer_loss": NaN, "step": 535, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.1701, "grad_norm": 1.875, "learning_rate": 1.577216258819635e-05, "long_answer_loss": 0.1701, "loss": 0.1614, "short_answer_loss": NaN, "step": 536, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.137, "grad_norm": 1.7890625, "learning_rate": 1.5740514857909312e-05, "long_answer_loss": 0.137, "loss": 0.1698, "short_answer_loss": NaN, "step": 537, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.1651, "grad_norm": 1.71875, "learning_rate": 1.570884484327707e-05, "long_answer_loss": 0.1651, "loss": 0.1594, "short_answer_loss": NaN, "step": 538, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.132, "grad_norm": 1.828125, "learning_rate": 1.5677152762087714e-05, "long_answer_loss": 0.132, "loss": 0.1594, "short_answer_loss": NaN, "step": 539, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.1649, "grad_norm": 1.7109375, "learning_rate": 1.5645438832281077e-05, "long_answer_loss": 0.1649, "loss": 0.1537, "short_answer_loss": NaN, "step": 540, "template_loss": 0.0 }, { "epoch": 0.87, "full_loss": 0.1636, "grad_norm": 1.6328125, "learning_rate": 1.561370327194725e-05, "long_answer_loss": 0.1636, "loss": 0.1579, "short_answer_loss": NaN, "step": 541, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.1688, "grad_norm": 1.75, "learning_rate": 1.558194629932506e-05, "long_answer_loss": 0.1688, "loss": 0.1685, "short_answer_loss": NaN, "step": 542, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.1594, "grad_norm": 1.7109375, "learning_rate": 1.5550168132800585e-05, "long_answer_loss": 0.1594, "loss": 0.1544, "short_answer_loss": NaN, "step": 543, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.1528, "grad_norm": 1.6875, "learning_rate": 1.5518368990905664e-05, "long_answer_loss": 0.1528, "loss": 0.1653, "short_answer_loss": NaN, "step": 544, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.1369, "grad_norm": 1.6015625, "learning_rate": 1.5486549092316355e-05, "long_answer_loss": 0.1369, "loss": 0.1556, "short_answer_loss": NaN, "step": 545, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.1606, "grad_norm": 1.71875, "learning_rate": 1.545470865585147e-05, "long_answer_loss": 0.1606, "loss": 0.1528, "short_answer_loss": NaN, "step": 546, "template_loss": 0.0 }, { "epoch": 0.88, "full_loss": 0.171, "grad_norm": 1.65625, "learning_rate": 1.5422847900471063e-05, "long_answer_loss": 0.171, "loss": 0.1659, "short_answer_loss": NaN, "step": 547, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.1376, "grad_norm": 1.6875, "learning_rate": 1.53909670452749e-05, "long_answer_loss": 0.1376, "loss": 0.1562, "short_answer_loss": NaN, "step": 548, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.161, "grad_norm": 1.703125, "learning_rate": 1.5359066309500974e-05, "long_answer_loss": 0.161, "loss": 0.1572, "short_answer_loss": NaN, "step": 549, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.1466, "grad_norm": 1.65625, "learning_rate": 1.5327145912524e-05, "long_answer_loss": 0.1466, "loss": 0.1558, "short_answer_loss": NaN, "step": 550, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.1694, "grad_norm": 1.84375, "learning_rate": 1.5295206073853896e-05, "long_answer_loss": 0.1694, "loss": 0.1675, "short_answer_loss": NaN, "step": 551, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.1796, "grad_norm": 1.84375, "learning_rate": 1.526324701313427e-05, "long_answer_loss": 0.1796, "loss": 0.16, "short_answer_loss": NaN, "step": 552, "template_loss": 0.0 }, { "epoch": 0.89, "full_loss": 0.1545, "grad_norm": 1.703125, "learning_rate": 1.5231268950140926e-05, "long_answer_loss": 0.1545, "loss": 0.1545, "short_answer_loss": NaN, "step": 553, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.147, "grad_norm": 1.6796875, "learning_rate": 1.5199272104780332e-05, "long_answer_loss": 0.147, "loss": 0.1467, "short_answer_loss": NaN, "step": 554, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.149, "grad_norm": 1.703125, "learning_rate": 1.5167256697088128e-05, "long_answer_loss": 0.149, "loss": 0.1609, "short_answer_loss": NaN, "step": 555, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.165, "grad_norm": 1.6484375, "learning_rate": 1.5135222947227598e-05, "long_answer_loss": 0.165, "loss": 0.158, "short_answer_loss": NaN, "step": 556, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.1821, "grad_norm": 1.7578125, "learning_rate": 1.510317107548816e-05, "long_answer_loss": 0.1821, "loss": 0.1679, "short_answer_loss": NaN, "step": 557, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.1337, "grad_norm": 1.6953125, "learning_rate": 1.507110130228386e-05, "long_answer_loss": 0.1337, "loss": 0.1562, "short_answer_loss": NaN, "step": 558, "template_loss": 0.0 }, { "epoch": 0.9, "full_loss": 0.1507, "grad_norm": 1.6484375, "learning_rate": 1.5039013848151839e-05, "long_answer_loss": 0.1507, "loss": 0.1464, "short_answer_loss": NaN, "step": 559, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.1295, "grad_norm": 1.640625, "learning_rate": 1.5006908933750829e-05, "long_answer_loss": 0.1295, "loss": 0.1484, "short_answer_loss": NaN, "step": 560, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.1581, "grad_norm": 1.65625, "learning_rate": 1.4974786779859642e-05, "long_answer_loss": 0.1581, "loss": 0.1511, "short_answer_loss": NaN, "step": 561, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.1504, "grad_norm": 1.6640625, "learning_rate": 1.4942647607375629e-05, "long_answer_loss": 0.1504, "loss": 0.1664, "short_answer_loss": NaN, "step": 562, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.1607, "grad_norm": 1.8046875, "learning_rate": 1.4910491637313176e-05, "long_answer_loss": 0.1607, "loss": 0.1653, "short_answer_loss": NaN, "step": 563, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.168, "grad_norm": 1.6875, "learning_rate": 1.4878319090802196e-05, "long_answer_loss": 0.168, "loss": 0.1615, "short_answer_loss": NaN, "step": 564, "template_loss": 0.0 }, { "epoch": 0.91, "full_loss": 0.1271, "grad_norm": 1.6015625, "learning_rate": 1.4846130189086577e-05, "long_answer_loss": 0.1271, "loss": 0.1421, "short_answer_loss": NaN, "step": 565, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1694, "grad_norm": 1.6875, "learning_rate": 1.4813925153522693e-05, "long_answer_loss": 0.1694, "loss": 0.1608, "short_answer_loss": NaN, "step": 566, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1439, "grad_norm": 1.640625, "learning_rate": 1.4781704205577856e-05, "long_answer_loss": 0.1439, "loss": 0.1462, "short_answer_loss": NaN, "step": 567, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1338, "grad_norm": 1.828125, "learning_rate": 1.4749467566828808e-05, "long_answer_loss": 0.1338, "loss": 0.153, "short_answer_loss": NaN, "step": 568, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.169, "grad_norm": 1.7890625, "learning_rate": 1.4717215458960198e-05, "long_answer_loss": 0.169, "loss": 0.1626, "short_answer_loss": NaN, "step": 569, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1542, "grad_norm": 1.7265625, "learning_rate": 1.4684948103763046e-05, "long_answer_loss": 0.1542, "loss": 0.1596, "short_answer_loss": NaN, "step": 570, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1458, "grad_norm": 1.7578125, "learning_rate": 1.465266572313323e-05, "long_answer_loss": 0.1458, "loss": 0.1556, "short_answer_loss": NaN, "step": 571, "template_loss": 0.0 }, { "epoch": 0.92, "full_loss": 0.1489, "grad_norm": 1.828125, "learning_rate": 1.462036853906995e-05, "long_answer_loss": 0.1489, "loss": 0.1652, "short_answer_loss": NaN, "step": 572, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1884, "grad_norm": 1.78125, "learning_rate": 1.4588056773674209e-05, "long_answer_loss": 0.1884, "loss": 0.164, "short_answer_loss": NaN, "step": 573, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1575, "grad_norm": 1.734375, "learning_rate": 1.4555730649147285e-05, "long_answer_loss": 0.1575, "loss": 0.1548, "short_answer_loss": NaN, "step": 574, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1655, "grad_norm": 1.7734375, "learning_rate": 1.4523390387789193e-05, "long_answer_loss": 0.1655, "loss": 0.1563, "short_answer_loss": NaN, "step": 575, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1531, "grad_norm": 1.859375, "learning_rate": 1.4491036211997175e-05, "long_answer_loss": 0.1531, "loss": 0.1585, "short_answer_loss": NaN, "step": 576, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1501, "grad_norm": 1.875, "learning_rate": 1.4458668344264151e-05, "long_answer_loss": 0.1501, "loss": 0.1459, "short_answer_loss": NaN, "step": 577, "template_loss": 0.0 }, { "epoch": 0.93, "full_loss": 0.1541, "grad_norm": 1.75, "learning_rate": 1.4426287007177197e-05, "long_answer_loss": 0.1541, "loss": 0.1511, "short_answer_loss": NaN, "step": 578, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1538, "grad_norm": 1.578125, "learning_rate": 1.4393892423416025e-05, "long_answer_loss": 0.1538, "loss": 0.1514, "short_answer_loss": NaN, "step": 579, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1665, "grad_norm": 1.5703125, "learning_rate": 1.4361484815751434e-05, "long_answer_loss": 0.1665, "loss": 0.1508, "short_answer_loss": NaN, "step": 580, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1698, "grad_norm": 1.6796875, "learning_rate": 1.432906440704378e-05, "long_answer_loss": 0.1698, "loss": 0.155, "short_answer_loss": NaN, "step": 581, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1477, "grad_norm": 1.6796875, "learning_rate": 1.4296631420241463e-05, "long_answer_loss": 0.1477, "loss": 0.1553, "short_answer_loss": NaN, "step": 582, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1408, "grad_norm": 1.703125, "learning_rate": 1.4264186078379369e-05, "long_answer_loss": 0.1408, "loss": 0.1527, "short_answer_loss": NaN, "step": 583, "template_loss": 0.0 }, { "epoch": 0.94, "full_loss": 0.1557, "grad_norm": 1.7109375, "learning_rate": 1.4231728604577352e-05, "long_answer_loss": 0.1557, "loss": 0.1566, "short_answer_loss": NaN, "step": 584, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1524, "grad_norm": 1.8203125, "learning_rate": 1.4199259222038694e-05, "long_answer_loss": 0.1524, "loss": 0.1529, "short_answer_loss": NaN, "step": 585, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1409, "grad_norm": 1.640625, "learning_rate": 1.416677815404857e-05, "long_answer_loss": 0.1409, "loss": 0.1479, "short_answer_loss": NaN, "step": 586, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1292, "grad_norm": 1.78125, "learning_rate": 1.4134285623972514e-05, "long_answer_loss": 0.1292, "loss": 0.1496, "short_answer_loss": NaN, "step": 587, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1612, "grad_norm": 1.6875, "learning_rate": 1.4101781855254883e-05, "long_answer_loss": 0.1612, "loss": 0.1509, "short_answer_loss": NaN, "step": 588, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1768, "grad_norm": 1.78125, "learning_rate": 1.406926707141732e-05, "long_answer_loss": 0.1768, "loss": 0.1594, "short_answer_loss": NaN, "step": 589, "template_loss": 0.0 }, { "epoch": 0.95, "full_loss": 0.1408, "grad_norm": 1.6953125, "learning_rate": 1.4036741496057213e-05, "long_answer_loss": 0.1408, "loss": 0.1453, "short_answer_loss": NaN, "step": 590, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.1442, "grad_norm": 1.7734375, "learning_rate": 1.4004205352846164e-05, "long_answer_loss": 0.1442, "loss": 0.1534, "short_answer_loss": NaN, "step": 591, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.1261, "grad_norm": 1.78125, "learning_rate": 1.3971658865528451e-05, "long_answer_loss": 0.1261, "loss": 0.1595, "short_answer_loss": NaN, "step": 592, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.182, "grad_norm": 1.734375, "learning_rate": 1.3939102257919481e-05, "long_answer_loss": 0.182, "loss": 0.1481, "short_answer_loss": NaN, "step": 593, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.1522, "grad_norm": 1.75, "learning_rate": 1.390653575390426e-05, "long_answer_loss": 0.1522, "loss": 0.1629, "short_answer_loss": NaN, "step": 594, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.1329, "grad_norm": 1.8125, "learning_rate": 1.3873959577435847e-05, "long_answer_loss": 0.1329, "loss": 0.1608, "short_answer_loss": NaN, "step": 595, "template_loss": 0.0 }, { "epoch": 0.96, "full_loss": 0.1544, "grad_norm": 1.7265625, "learning_rate": 1.3841373952533812e-05, "long_answer_loss": 0.1544, "loss": 0.1516, "short_answer_loss": NaN, "step": 596, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1549, "grad_norm": 1.671875, "learning_rate": 1.3808779103282712e-05, "long_answer_loss": 0.1549, "loss": 0.1498, "short_answer_loss": NaN, "step": 597, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1468, "grad_norm": 1.625, "learning_rate": 1.3776175253830531e-05, "long_answer_loss": 0.1468, "loss": 0.1496, "short_answer_loss": NaN, "step": 598, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1515, "grad_norm": 1.703125, "learning_rate": 1.3743562628387141e-05, "long_answer_loss": 0.1515, "loss": 0.1535, "short_answer_loss": NaN, "step": 599, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1576, "grad_norm": 1.734375, "learning_rate": 1.3710941451222776e-05, "long_answer_loss": 0.1576, "loss": 0.1506, "short_answer_loss": NaN, "step": 600, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1565, "grad_norm": 1.6015625, "learning_rate": 1.367831194666646e-05, "long_answer_loss": 0.1565, "loss": 0.1484, "short_answer_loss": NaN, "step": 601, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1385, "grad_norm": 1.765625, "learning_rate": 1.3645674339104508e-05, "long_answer_loss": 0.1385, "loss": 0.1581, "short_answer_loss": NaN, "step": 602, "template_loss": 0.0 }, { "epoch": 0.97, "full_loss": 0.1386, "grad_norm": 1.7265625, "learning_rate": 1.3613028852978934e-05, "long_answer_loss": 0.1386, "loss": 0.1505, "short_answer_loss": NaN, "step": 603, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.1686, "grad_norm": 1.625, "learning_rate": 1.3580375712785945e-05, "long_answer_loss": 0.1686, "loss": 0.1552, "short_answer_loss": NaN, "step": 604, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.1538, "grad_norm": 1.71875, "learning_rate": 1.354771514307438e-05, "long_answer_loss": 0.1538, "loss": 0.1548, "short_answer_loss": NaN, "step": 605, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.1412, "grad_norm": 1.65625, "learning_rate": 1.3515047368444169e-05, "long_answer_loss": 0.1412, "loss": 0.1488, "short_answer_loss": NaN, "step": 606, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.1467, "grad_norm": 1.6171875, "learning_rate": 1.3482372613544788e-05, "long_answer_loss": 0.1467, "loss": 0.15, "short_answer_loss": NaN, "step": 607, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.191, "grad_norm": 1.671875, "learning_rate": 1.3449691103073714e-05, "long_answer_loss": 0.191, "loss": 0.1575, "short_answer_loss": NaN, "step": 608, "template_loss": 0.0 }, { "epoch": 0.98, "full_loss": 0.1561, "grad_norm": 1.6171875, "learning_rate": 1.3417003061774886e-05, "long_answer_loss": 0.1561, "loss": 0.1569, "short_answer_loss": NaN, "step": 609, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1616, "grad_norm": 1.6875, "learning_rate": 1.3384308714437146e-05, "long_answer_loss": 0.1616, "loss": 0.1667, "short_answer_loss": NaN, "step": 610, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1367, "grad_norm": 1.6484375, "learning_rate": 1.3351608285892708e-05, "long_answer_loss": 0.1367, "loss": 0.1531, "short_answer_loss": NaN, "step": 611, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1453, "grad_norm": 1.71875, "learning_rate": 1.3318902001015602e-05, "long_answer_loss": 0.1453, "loss": 0.1489, "short_answer_loss": NaN, "step": 612, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1351, "grad_norm": 1.7265625, "learning_rate": 1.328619008472013e-05, "long_answer_loss": 0.1351, "loss": 0.1525, "short_answer_loss": NaN, "step": 613, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1462, "grad_norm": 1.734375, "learning_rate": 1.3253472761959326e-05, "long_answer_loss": 0.1462, "loss": 0.1559, "short_answer_loss": NaN, "step": 614, "template_loss": 0.0 }, { "epoch": 0.99, "full_loss": 0.1419, "grad_norm": 1.7578125, "learning_rate": 1.3220750257723397e-05, "long_answer_loss": 0.1419, "loss": 0.159, "short_answer_loss": NaN, "step": 615, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.1622, "grad_norm": 1.796875, "learning_rate": 1.3188022797038183e-05, "long_answer_loss": 0.1622, "loss": 0.1514, "short_answer_loss": NaN, "step": 616, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.1706, "grad_norm": 1.75, "learning_rate": 1.3155290604963613e-05, "long_answer_loss": 0.1706, "loss": 0.1585, "short_answer_loss": NaN, "step": 617, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.1639, "grad_norm": 1.765625, "learning_rate": 1.3122553906592142e-05, "long_answer_loss": 0.1639, "loss": 0.1556, "short_answer_loss": NaN, "step": 618, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.114, "grad_norm": 1.5703125, "learning_rate": 1.3089812927047224e-05, "long_answer_loss": 0.114, "loss": 0.1224, "short_answer_loss": NaN, "step": 619, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.0845, "grad_norm": 1.4140625, "learning_rate": 1.3057067891481752e-05, "long_answer_loss": 0.0845, "loss": 0.0856, "short_answer_loss": NaN, "step": 620, "template_loss": 0.0 }, { "epoch": 1.0, "full_loss": 0.0776, "grad_norm": 1.4609375, "learning_rate": 1.3024319025076509e-05, "long_answer_loss": 0.0776, "loss": 0.0818, "short_answer_loss": NaN, "step": 621, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0675, "grad_norm": 1.421875, "learning_rate": 1.2991566553038623e-05, "long_answer_loss": 0.0675, "loss": 0.0814, "short_answer_loss": NaN, "step": 622, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0837, "grad_norm": 1.609375, "learning_rate": 1.2958810700600017e-05, "long_answer_loss": 0.0837, "loss": 0.0857, "short_answer_loss": NaN, "step": 623, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0696, "grad_norm": 1.6328125, "learning_rate": 1.2926051693015858e-05, "long_answer_loss": 0.0696, "loss": 0.0811, "short_answer_loss": NaN, "step": 624, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0957, "grad_norm": 1.796875, "learning_rate": 1.2893289755563017e-05, "long_answer_loss": 0.0957, "loss": 0.0848, "short_answer_loss": NaN, "step": 625, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0815, "grad_norm": 1.5546875, "learning_rate": 1.2860525113538505e-05, "long_answer_loss": 0.0815, "loss": 0.0819, "short_answer_loss": NaN, "step": 626, "template_loss": 0.0 }, { "epoch": 1.01, "full_loss": 0.0937, "grad_norm": 1.8203125, "learning_rate": 1.2827757992257939e-05, "long_answer_loss": 0.0937, "loss": 0.0895, "short_answer_loss": NaN, "step": 627, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0934, "grad_norm": 1.8359375, "learning_rate": 1.2794988617053979e-05, "long_answer_loss": 0.0934, "loss": 0.082, "short_answer_loss": NaN, "step": 628, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0839, "grad_norm": 1.8515625, "learning_rate": 1.2762217213274788e-05, "long_answer_loss": 0.0839, "loss": 0.0818, "short_answer_loss": NaN, "step": 629, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0908, "grad_norm": 1.6328125, "learning_rate": 1.2729444006282481e-05, "long_answer_loss": 0.0908, "loss": 0.0785, "short_answer_loss": NaN, "step": 630, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0839, "grad_norm": 1.7109375, "learning_rate": 1.269666922145157e-05, "long_answer_loss": 0.0839, "loss": 0.0829, "short_answer_loss": NaN, "step": 631, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0815, "grad_norm": 1.9140625, "learning_rate": 1.266389308416742e-05, "long_answer_loss": 0.0815, "loss": 0.0868, "short_answer_loss": NaN, "step": 632, "template_loss": 0.0 }, { "epoch": 1.02, "full_loss": 0.0666, "grad_norm": 1.6484375, "learning_rate": 1.2631115819824688e-05, "long_answer_loss": 0.0666, "loss": 0.0788, "short_answer_loss": NaN, "step": 633, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.0738, "grad_norm": 1.6328125, "learning_rate": 1.2598337653825798e-05, "long_answer_loss": 0.0738, "loss": 0.083, "short_answer_loss": NaN, "step": 634, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.08, "grad_norm": 1.7109375, "learning_rate": 1.2565558811579359e-05, "long_answer_loss": 0.08, "loss": 0.0805, "short_answer_loss": NaN, "step": 635, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.0682, "grad_norm": 1.5234375, "learning_rate": 1.2532779518498639e-05, "long_answer_loss": 0.0682, "loss": 0.0826, "short_answer_loss": NaN, "step": 636, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.0862, "grad_norm": 1.5390625, "learning_rate": 1.25e-05, "long_answer_loss": 0.0862, "loss": 0.08, "short_answer_loss": NaN, "step": 637, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.0853, "grad_norm": 1.5234375, "learning_rate": 1.2467220481501365e-05, "long_answer_loss": 0.0853, "loss": 0.0784, "short_answer_loss": NaN, "step": 638, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.0645, "grad_norm": 1.4453125, "learning_rate": 1.2434441188420644e-05, "long_answer_loss": 0.0645, "loss": 0.0767, "short_answer_loss": NaN, "step": 639, "template_loss": 0.0 }, { "epoch": 1.03, "full_loss": 0.1023, "grad_norm": 1.578125, "learning_rate": 1.2401662346174206e-05, "long_answer_loss": 0.1023, "loss": 0.0801, "short_answer_loss": NaN, "step": 640, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0786, "grad_norm": 1.484375, "learning_rate": 1.2368884180175313e-05, "long_answer_loss": 0.0786, "loss": 0.0799, "short_answer_loss": NaN, "step": 641, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0839, "grad_norm": 1.4921875, "learning_rate": 1.2336106915832585e-05, "long_answer_loss": 0.0839, "loss": 0.0813, "short_answer_loss": NaN, "step": 642, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0719, "grad_norm": 1.46875, "learning_rate": 1.2303330778548433e-05, "long_answer_loss": 0.0719, "loss": 0.0821, "short_answer_loss": NaN, "step": 643, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0633, "grad_norm": 1.484375, "learning_rate": 1.2270555993717521e-05, "long_answer_loss": 0.0633, "loss": 0.0741, "short_answer_loss": NaN, "step": 644, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0928, "grad_norm": 1.546875, "learning_rate": 1.2237782786725215e-05, "long_answer_loss": 0.0928, "loss": 0.0798, "short_answer_loss": NaN, "step": 645, "template_loss": 0.0 }, { "epoch": 1.04, "full_loss": 0.0697, "grad_norm": 1.484375, "learning_rate": 1.2205011382946024e-05, "long_answer_loss": 0.0697, "loss": 0.0757, "short_answer_loss": NaN, "step": 646, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.0761, "grad_norm": 1.46875, "learning_rate": 1.2172242007742066e-05, "long_answer_loss": 0.0761, "loss": 0.0738, "short_answer_loss": NaN, "step": 647, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.0855, "grad_norm": 1.640625, "learning_rate": 1.21394748864615e-05, "long_answer_loss": 0.0855, "loss": 0.0834, "short_answer_loss": NaN, "step": 648, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.0786, "grad_norm": 1.625, "learning_rate": 1.210671024443699e-05, "long_answer_loss": 0.0786, "loss": 0.0752, "short_answer_loss": NaN, "step": 649, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.0761, "grad_norm": 1.640625, "learning_rate": 1.2073948306984148e-05, "long_answer_loss": 0.0761, "loss": 0.0794, "short_answer_loss": NaN, "step": 650, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.093, "grad_norm": 1.5390625, "learning_rate": 1.2041189299399991e-05, "long_answer_loss": 0.093, "loss": 0.0855, "short_answer_loss": NaN, "step": 651, "template_loss": 0.0 }, { "epoch": 1.05, "full_loss": 0.0824, "grad_norm": 1.5234375, "learning_rate": 1.2008433446961384e-05, "long_answer_loss": 0.0824, "loss": 0.0753, "short_answer_loss": NaN, "step": 652, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0785, "grad_norm": 1.53125, "learning_rate": 1.1975680974923497e-05, "long_answer_loss": 0.0785, "loss": 0.0746, "short_answer_loss": NaN, "step": 653, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0799, "grad_norm": 1.734375, "learning_rate": 1.194293210851825e-05, "long_answer_loss": 0.0799, "loss": 0.0826, "short_answer_loss": NaN, "step": 654, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0642, "grad_norm": 1.5078125, "learning_rate": 1.1910187072952779e-05, "long_answer_loss": 0.0642, "loss": 0.0764, "short_answer_loss": NaN, "step": 655, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0772, "grad_norm": 1.53125, "learning_rate": 1.1877446093407861e-05, "long_answer_loss": 0.0772, "loss": 0.0729, "short_answer_loss": NaN, "step": 656, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0799, "grad_norm": 1.578125, "learning_rate": 1.184470939503639e-05, "long_answer_loss": 0.0799, "loss": 0.0781, "short_answer_loss": NaN, "step": 657, "template_loss": 0.0 }, { "epoch": 1.06, "full_loss": 0.0733, "grad_norm": 1.734375, "learning_rate": 1.1811977202961817e-05, "long_answer_loss": 0.0733, "loss": 0.0795, "short_answer_loss": NaN, "step": 658, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.089, "grad_norm": 1.5390625, "learning_rate": 1.1779249742276603e-05, "long_answer_loss": 0.089, "loss": 0.0743, "short_answer_loss": NaN, "step": 659, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.0853, "grad_norm": 1.59375, "learning_rate": 1.1746527238040674e-05, "long_answer_loss": 0.0853, "loss": 0.0819, "short_answer_loss": NaN, "step": 660, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.069, "grad_norm": 1.5703125, "learning_rate": 1.171380991527987e-05, "long_answer_loss": 0.069, "loss": 0.0769, "short_answer_loss": NaN, "step": 661, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.0942, "grad_norm": 1.671875, "learning_rate": 1.1681097998984401e-05, "long_answer_loss": 0.0942, "loss": 0.0812, "short_answer_loss": NaN, "step": 662, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.0842, "grad_norm": 1.5703125, "learning_rate": 1.1648391714107295e-05, "long_answer_loss": 0.0842, "loss": 0.0781, "short_answer_loss": NaN, "step": 663, "template_loss": 0.0 }, { "epoch": 1.07, "full_loss": 0.0594, "grad_norm": 1.5703125, "learning_rate": 1.1615691285562857e-05, "long_answer_loss": 0.0594, "loss": 0.076, "short_answer_loss": NaN, "step": 664, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0882, "grad_norm": 1.6640625, "learning_rate": 1.1582996938225119e-05, "long_answer_loss": 0.0882, "loss": 0.0805, "short_answer_loss": NaN, "step": 665, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0722, "grad_norm": 1.4921875, "learning_rate": 1.1550308896926288e-05, "long_answer_loss": 0.0722, "loss": 0.0722, "short_answer_loss": NaN, "step": 666, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0663, "grad_norm": 1.5546875, "learning_rate": 1.1517627386455215e-05, "long_answer_loss": 0.0663, "loss": 0.0778, "short_answer_loss": NaN, "step": 667, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0864, "grad_norm": 1.609375, "learning_rate": 1.1484952631555834e-05, "long_answer_loss": 0.0864, "loss": 0.0764, "short_answer_loss": NaN, "step": 668, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0743, "grad_norm": 1.484375, "learning_rate": 1.1452284856925621e-05, "long_answer_loss": 0.0743, "loss": 0.0791, "short_answer_loss": NaN, "step": 669, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0793, "grad_norm": 1.609375, "learning_rate": 1.1419624287214057e-05, "long_answer_loss": 0.0793, "loss": 0.0777, "short_answer_loss": NaN, "step": 670, "template_loss": 0.0 }, { "epoch": 1.08, "full_loss": 0.0805, "grad_norm": 1.6796875, "learning_rate": 1.1386971147021067e-05, "long_answer_loss": 0.0805, "loss": 0.0801, "short_answer_loss": NaN, "step": 671, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.0593, "grad_norm": 1.5078125, "learning_rate": 1.1354325660895496e-05, "long_answer_loss": 0.0593, "loss": 0.0735, "short_answer_loss": NaN, "step": 672, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.0708, "grad_norm": 1.609375, "learning_rate": 1.132168805333354e-05, "long_answer_loss": 0.0708, "loss": 0.0755, "short_answer_loss": NaN, "step": 673, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.0748, "grad_norm": 1.5546875, "learning_rate": 1.1289058548777229e-05, "long_answer_loss": 0.0748, "loss": 0.0804, "short_answer_loss": NaN, "step": 674, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.089, "grad_norm": 1.46875, "learning_rate": 1.125643737161286e-05, "long_answer_loss": 0.089, "loss": 0.0764, "short_answer_loss": NaN, "step": 675, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.0728, "grad_norm": 1.578125, "learning_rate": 1.1223824746169472e-05, "long_answer_loss": 0.0728, "loss": 0.0801, "short_answer_loss": NaN, "step": 676, "template_loss": 0.0 }, { "epoch": 1.09, "full_loss": 0.0789, "grad_norm": 1.4453125, "learning_rate": 1.119122089671729e-05, "long_answer_loss": 0.0789, "loss": 0.0721, "short_answer_loss": NaN, "step": 677, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.0764, "grad_norm": 1.6484375, "learning_rate": 1.1158626047466191e-05, "long_answer_loss": 0.0764, "loss": 0.0802, "short_answer_loss": NaN, "step": 678, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.085, "grad_norm": 1.5, "learning_rate": 1.112604042256416e-05, "long_answer_loss": 0.085, "loss": 0.0775, "short_answer_loss": NaN, "step": 679, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.0571, "grad_norm": 1.5234375, "learning_rate": 1.1093464246095746e-05, "long_answer_loss": 0.0571, "loss": 0.0771, "short_answer_loss": NaN, "step": 680, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.0718, "grad_norm": 1.515625, "learning_rate": 1.1060897742080525e-05, "long_answer_loss": 0.0718, "loss": 0.0747, "short_answer_loss": NaN, "step": 681, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.0846, "grad_norm": 1.65625, "learning_rate": 1.1028341134471554e-05, "long_answer_loss": 0.0846, "loss": 0.0819, "short_answer_loss": NaN, "step": 682, "template_loss": 0.0 }, { "epoch": 1.1, "full_loss": 0.0797, "grad_norm": 1.5625, "learning_rate": 1.0995794647153842e-05, "long_answer_loss": 0.0797, "loss": 0.0793, "short_answer_loss": NaN, "step": 683, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.078, "grad_norm": 1.6953125, "learning_rate": 1.0963258503942795e-05, "long_answer_loss": 0.078, "loss": 0.0824, "short_answer_loss": NaN, "step": 684, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.1139, "grad_norm": 1.4375, "learning_rate": 1.0930732928582687e-05, "long_answer_loss": 0.1139, "loss": 0.0814, "short_answer_loss": NaN, "step": 685, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.0878, "grad_norm": 1.578125, "learning_rate": 1.0898218144745123e-05, "long_answer_loss": 0.0878, "loss": 0.076, "short_answer_loss": NaN, "step": 686, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.0792, "grad_norm": 1.53125, "learning_rate": 1.0865714376027488e-05, "long_answer_loss": 0.0792, "loss": 0.0737, "short_answer_loss": NaN, "step": 687, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.0748, "grad_norm": 1.640625, "learning_rate": 1.0833221845951433e-05, "long_answer_loss": 0.0748, "loss": 0.0741, "short_answer_loss": NaN, "step": 688, "template_loss": 0.0 }, { "epoch": 1.11, "full_loss": 0.0648, "grad_norm": 1.453125, "learning_rate": 1.080074077796131e-05, "long_answer_loss": 0.0648, "loss": 0.0709, "short_answer_loss": NaN, "step": 689, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.0664, "grad_norm": 1.5234375, "learning_rate": 1.0768271395422651e-05, "long_answer_loss": 0.0664, "loss": 0.073, "short_answer_loss": NaN, "step": 690, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.0841, "grad_norm": 1.484375, "learning_rate": 1.0735813921620634e-05, "long_answer_loss": 0.0841, "loss": 0.0769, "short_answer_loss": NaN, "step": 691, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.079, "grad_norm": 1.6796875, "learning_rate": 1.070336857975854e-05, "long_answer_loss": 0.079, "loss": 0.0814, "short_answer_loss": NaN, "step": 692, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.0902, "grad_norm": 1.65625, "learning_rate": 1.0670935592956223e-05, "long_answer_loss": 0.0902, "loss": 0.0773, "short_answer_loss": NaN, "step": 693, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.0867, "grad_norm": 1.4609375, "learning_rate": 1.0638515184248571e-05, "long_answer_loss": 0.0867, "loss": 0.0734, "short_answer_loss": NaN, "step": 694, "template_loss": 0.0 }, { "epoch": 1.12, "full_loss": 0.0694, "grad_norm": 1.5, "learning_rate": 1.0606107576583976e-05, "long_answer_loss": 0.0694, "loss": 0.0738, "short_answer_loss": NaN, "step": 695, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.1009, "grad_norm": 1.59375, "learning_rate": 1.0573712992822804e-05, "long_answer_loss": 0.1009, "loss": 0.0839, "short_answer_loss": NaN, "step": 696, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.0949, "grad_norm": 1.484375, "learning_rate": 1.0541331655735853e-05, "long_answer_loss": 0.0949, "loss": 0.0754, "short_answer_loss": NaN, "step": 697, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.0804, "grad_norm": 1.5703125, "learning_rate": 1.0508963788002827e-05, "long_answer_loss": 0.0804, "loss": 0.0738, "short_answer_loss": NaN, "step": 698, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.0741, "grad_norm": 1.5703125, "learning_rate": 1.0476609612210808e-05, "long_answer_loss": 0.0741, "loss": 0.073, "short_answer_loss": NaN, "step": 699, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.0814, "grad_norm": 1.65625, "learning_rate": 1.0444269350852718e-05, "long_answer_loss": 0.0814, "loss": 0.0791, "short_answer_loss": NaN, "step": 700, "template_loss": 0.0 }, { "epoch": 1.13, "full_loss": 0.0724, "grad_norm": 1.578125, "learning_rate": 1.0411943226325793e-05, "long_answer_loss": 0.0724, "loss": 0.071, "short_answer_loss": NaN, "step": 701, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.084, "grad_norm": 1.546875, "learning_rate": 1.0379631460930054e-05, "long_answer_loss": 0.084, "loss": 0.0769, "short_answer_loss": NaN, "step": 702, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.0656, "grad_norm": 1.671875, "learning_rate": 1.0347334276866772e-05, "long_answer_loss": 0.0656, "loss": 0.0793, "short_answer_loss": NaN, "step": 703, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.0907, "grad_norm": 1.6953125, "learning_rate": 1.0315051896236955e-05, "long_answer_loss": 0.0907, "loss": 0.0867, "short_answer_loss": NaN, "step": 704, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.073, "grad_norm": 1.625, "learning_rate": 1.0282784541039804e-05, "long_answer_loss": 0.073, "loss": 0.0808, "short_answer_loss": NaN, "step": 705, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.0825, "grad_norm": 1.6328125, "learning_rate": 1.0250532433171194e-05, "long_answer_loss": 0.0825, "loss": 0.0803, "short_answer_loss": NaN, "step": 706, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.0669, "grad_norm": 1.53125, "learning_rate": 1.0218295794422147e-05, "long_answer_loss": 0.0669, "loss": 0.0733, "short_answer_loss": NaN, "step": 707, "template_loss": 0.0 }, { "epoch": 1.14, "full_loss": 0.08, "grad_norm": 1.65625, "learning_rate": 1.018607484647731e-05, "long_answer_loss": 0.08, "loss": 0.0771, "short_answer_loss": NaN, "step": 708, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.0723, "grad_norm": 1.5703125, "learning_rate": 1.0153869810913424e-05, "long_answer_loss": 0.0723, "loss": 0.0799, "short_answer_loss": NaN, "step": 709, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.0776, "grad_norm": 1.6171875, "learning_rate": 1.0121680909197809e-05, "long_answer_loss": 0.0776, "loss": 0.0761, "short_answer_loss": NaN, "step": 710, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.0681, "grad_norm": 1.546875, "learning_rate": 1.0089508362686827e-05, "long_answer_loss": 0.0681, "loss": 0.0792, "short_answer_loss": NaN, "step": 711, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.0799, "grad_norm": 1.546875, "learning_rate": 1.0057352392624377e-05, "long_answer_loss": 0.0799, "loss": 0.079, "short_answer_loss": NaN, "step": 712, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.074, "grad_norm": 1.6015625, "learning_rate": 1.0025213220140364e-05, "long_answer_loss": 0.074, "loss": 0.0752, "short_answer_loss": NaN, "step": 713, "template_loss": 0.0 }, { "epoch": 1.15, "full_loss": 0.0629, "grad_norm": 1.578125, "learning_rate": 9.993091066249174e-06, "long_answer_loss": 0.0629, "loss": 0.0749, "short_answer_loss": NaN, "step": 714, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0891, "grad_norm": 1.5546875, "learning_rate": 9.960986151848167e-06, "long_answer_loss": 0.0891, "loss": 0.0742, "short_answer_loss": NaN, "step": 715, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0686, "grad_norm": 1.5, "learning_rate": 9.928898697716147e-06, "long_answer_loss": 0.0686, "loss": 0.0675, "short_answer_loss": NaN, "step": 716, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0818, "grad_norm": 1.453125, "learning_rate": 9.896828924511845e-06, "long_answer_loss": 0.0818, "loss": 0.0782, "short_answer_loss": NaN, "step": 717, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0696, "grad_norm": 1.53125, "learning_rate": 9.864777052772407e-06, "long_answer_loss": 0.0696, "loss": 0.077, "short_answer_loss": NaN, "step": 718, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0777, "grad_norm": 1.5546875, "learning_rate": 9.832743302911876e-06, "long_answer_loss": 0.0777, "loss": 0.0785, "short_answer_loss": NaN, "step": 719, "template_loss": 0.0 }, { "epoch": 1.16, "full_loss": 0.0767, "grad_norm": 1.5703125, "learning_rate": 9.800727895219672e-06, "long_answer_loss": 0.0767, "loss": 0.0728, "short_answer_loss": NaN, "step": 720, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.0671, "grad_norm": 1.5, "learning_rate": 9.768731049859073e-06, "long_answer_loss": 0.0671, "loss": 0.0729, "short_answer_loss": NaN, "step": 721, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.072, "grad_norm": 1.546875, "learning_rate": 9.736752986865727e-06, "long_answer_loss": 0.072, "loss": 0.0773, "short_answer_loss": NaN, "step": 722, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.0713, "grad_norm": 1.5078125, "learning_rate": 9.704793926146102e-06, "long_answer_loss": 0.0713, "loss": 0.0699, "short_answer_loss": NaN, "step": 723, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.0717, "grad_norm": 1.484375, "learning_rate": 9.672854087475997e-06, "long_answer_loss": 0.0717, "loss": 0.0706, "short_answer_loss": NaN, "step": 724, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.072, "grad_norm": 1.5078125, "learning_rate": 9.640933690499027e-06, "long_answer_loss": 0.072, "loss": 0.0711, "short_answer_loss": NaN, "step": 725, "template_loss": 0.0 }, { "epoch": 1.17, "full_loss": 0.0798, "grad_norm": 1.53125, "learning_rate": 9.609032954725104e-06, "long_answer_loss": 0.0798, "loss": 0.0744, "short_answer_loss": NaN, "step": 726, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0731, "grad_norm": 1.4765625, "learning_rate": 9.57715209952894e-06, "long_answer_loss": 0.0731, "loss": 0.069, "short_answer_loss": NaN, "step": 727, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0856, "grad_norm": 1.4453125, "learning_rate": 9.54529134414853e-06, "long_answer_loss": 0.0856, "loss": 0.0762, "short_answer_loss": NaN, "step": 728, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0758, "grad_norm": 1.6875, "learning_rate": 9.51345090768365e-06, "long_answer_loss": 0.0758, "loss": 0.0718, "short_answer_loss": NaN, "step": 729, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0776, "grad_norm": 1.5234375, "learning_rate": 9.481631009094341e-06, "long_answer_loss": 0.0776, "loss": 0.0745, "short_answer_loss": NaN, "step": 730, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0583, "grad_norm": 1.5078125, "learning_rate": 9.449831867199416e-06, "long_answer_loss": 0.0583, "loss": 0.0719, "short_answer_loss": NaN, "step": 731, "template_loss": 0.0 }, { "epoch": 1.18, "full_loss": 0.0661, "grad_norm": 1.4921875, "learning_rate": 9.418053700674944e-06, "long_answer_loss": 0.0661, "loss": 0.0754, "short_answer_loss": NaN, "step": 732, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0722, "grad_norm": 1.6015625, "learning_rate": 9.386296728052753e-06, "long_answer_loss": 0.0722, "loss": 0.0728, "short_answer_loss": NaN, "step": 733, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0594, "grad_norm": 1.6328125, "learning_rate": 9.354561167718922e-06, "long_answer_loss": 0.0594, "loss": 0.0725, "short_answer_loss": NaN, "step": 734, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0713, "grad_norm": 1.65625, "learning_rate": 9.322847237912288e-06, "long_answer_loss": 0.0713, "loss": 0.073, "short_answer_loss": NaN, "step": 735, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.1066, "grad_norm": 1.5859375, "learning_rate": 9.29115515672293e-06, "long_answer_loss": 0.1066, "loss": 0.0806, "short_answer_loss": NaN, "step": 736, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0749, "grad_norm": 1.609375, "learning_rate": 9.25948514209069e-06, "long_answer_loss": 0.0749, "loss": 0.0761, "short_answer_loss": NaN, "step": 737, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0544, "grad_norm": 1.5703125, "learning_rate": 9.227837411803656e-06, "long_answer_loss": 0.0544, "loss": 0.0718, "short_answer_loss": NaN, "step": 738, "template_loss": 0.0 }, { "epoch": 1.19, "full_loss": 0.0895, "grad_norm": 1.546875, "learning_rate": 9.196212183496669e-06, "long_answer_loss": 0.0895, "loss": 0.0714, "short_answer_loss": NaN, "step": 739, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.1009, "grad_norm": 1.546875, "learning_rate": 9.164609674649835e-06, "long_answer_loss": 0.1009, "loss": 0.0761, "short_answer_loss": NaN, "step": 740, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.0684, "grad_norm": 1.6171875, "learning_rate": 9.133030102587019e-06, "long_answer_loss": 0.0684, "loss": 0.0754, "short_answer_loss": NaN, "step": 741, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.0647, "grad_norm": 1.5703125, "learning_rate": 9.101473684474354e-06, "long_answer_loss": 0.0647, "loss": 0.0785, "short_answer_loss": NaN, "step": 742, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.0748, "grad_norm": 1.4609375, "learning_rate": 9.069940637318752e-06, "long_answer_loss": 0.0748, "loss": 0.0718, "short_answer_loss": NaN, "step": 743, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.0662, "grad_norm": 1.46875, "learning_rate": 9.038431177966406e-06, "long_answer_loss": 0.0662, "loss": 0.0744, "short_answer_loss": NaN, "step": 744, "template_loss": 0.0 }, { "epoch": 1.2, "full_loss": 0.0857, "grad_norm": 1.4453125, "learning_rate": 9.006945523101295e-06, "long_answer_loss": 0.0857, "loss": 0.0719, "short_answer_loss": NaN, "step": 745, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0737, "grad_norm": 1.484375, "learning_rate": 8.975483889243709e-06, "long_answer_loss": 0.0737, "loss": 0.0692, "short_answer_loss": NaN, "step": 746, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0732, "grad_norm": 1.4609375, "learning_rate": 8.944046492748746e-06, "long_answer_loss": 0.0732, "loss": 0.0711, "short_answer_loss": NaN, "step": 747, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0683, "grad_norm": 1.546875, "learning_rate": 8.912633549804824e-06, "long_answer_loss": 0.0683, "loss": 0.0704, "short_answer_loss": NaN, "step": 748, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0797, "grad_norm": 1.484375, "learning_rate": 8.88124527643221e-06, "long_answer_loss": 0.0797, "loss": 0.0754, "short_answer_loss": NaN, "step": 749, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0627, "grad_norm": 1.5, "learning_rate": 8.849881888481513e-06, "long_answer_loss": 0.0627, "loss": 0.0738, "short_answer_loss": NaN, "step": 750, "template_loss": 0.0 }, { "epoch": 1.21, "full_loss": 0.0746, "grad_norm": 1.5546875, "learning_rate": 8.818543601632215e-06, "long_answer_loss": 0.0746, "loss": 0.0736, "short_answer_loss": NaN, "step": 751, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0773, "grad_norm": 1.4765625, "learning_rate": 8.787230631391185e-06, "long_answer_loss": 0.0773, "loss": 0.0713, "short_answer_loss": NaN, "step": 752, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0852, "grad_norm": 1.5703125, "learning_rate": 8.755943193091187e-06, "long_answer_loss": 0.0852, "loss": 0.0733, "short_answer_loss": NaN, "step": 753, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0669, "grad_norm": 1.4765625, "learning_rate": 8.724681501889413e-06, "long_answer_loss": 0.0669, "loss": 0.0706, "short_answer_loss": NaN, "step": 754, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0729, "grad_norm": 1.609375, "learning_rate": 8.693445772766003e-06, "long_answer_loss": 0.0729, "loss": 0.0744, "short_answer_loss": NaN, "step": 755, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0827, "grad_norm": 1.453125, "learning_rate": 8.662236220522554e-06, "long_answer_loss": 0.0827, "loss": 0.0755, "short_answer_loss": NaN, "step": 756, "template_loss": 0.0 }, { "epoch": 1.22, "full_loss": 0.0792, "grad_norm": 1.5390625, "learning_rate": 8.631053059780647e-06, "long_answer_loss": 0.0792, "loss": 0.0745, "short_answer_loss": NaN, "step": 757, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0687, "grad_norm": 1.53125, "learning_rate": 8.599896504980384e-06, "long_answer_loss": 0.0687, "loss": 0.0755, "short_answer_loss": NaN, "step": 758, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0661, "grad_norm": 1.4296875, "learning_rate": 8.568766770378892e-06, "long_answer_loss": 0.0661, "loss": 0.0718, "short_answer_loss": NaN, "step": 759, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0701, "grad_norm": 1.5390625, "learning_rate": 8.537664070048867e-06, "long_answer_loss": 0.0701, "loss": 0.0723, "short_answer_loss": NaN, "step": 760, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0782, "grad_norm": 1.5078125, "learning_rate": 8.506588617877102e-06, "long_answer_loss": 0.0782, "loss": 0.0786, "short_answer_loss": NaN, "step": 761, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0597, "grad_norm": 1.5390625, "learning_rate": 8.475540627563e-06, "long_answer_loss": 0.0597, "loss": 0.0724, "short_answer_loss": NaN, "step": 762, "template_loss": 0.0 }, { "epoch": 1.23, "full_loss": 0.0623, "grad_norm": 1.5, "learning_rate": 8.444520312617118e-06, "long_answer_loss": 0.0623, "loss": 0.0708, "short_answer_loss": NaN, "step": 763, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0693, "grad_norm": 1.4921875, "learning_rate": 8.413527886359695e-06, "long_answer_loss": 0.0693, "loss": 0.074, "short_answer_loss": NaN, "step": 764, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0785, "grad_norm": 1.4296875, "learning_rate": 8.382563561919191e-06, "long_answer_loss": 0.0785, "loss": 0.0708, "short_answer_loss": NaN, "step": 765, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0677, "grad_norm": 1.6171875, "learning_rate": 8.351627552230806e-06, "long_answer_loss": 0.0677, "loss": 0.0747, "short_answer_loss": NaN, "step": 766, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0751, "grad_norm": 1.609375, "learning_rate": 8.320720070035035e-06, "long_answer_loss": 0.0751, "loss": 0.0776, "short_answer_loss": NaN, "step": 767, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0693, "grad_norm": 1.53125, "learning_rate": 8.289841327876183e-06, "long_answer_loss": 0.0693, "loss": 0.0737, "short_answer_loss": NaN, "step": 768, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0721, "grad_norm": 1.5234375, "learning_rate": 8.25899153810093e-06, "long_answer_loss": 0.0721, "loss": 0.0712, "short_answer_loss": NaN, "step": 769, "template_loss": 0.0 }, { "epoch": 1.24, "full_loss": 0.0587, "grad_norm": 1.515625, "learning_rate": 8.228170912856845e-06, "long_answer_loss": 0.0587, "loss": 0.0699, "short_answer_loss": NaN, "step": 770, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.0852, "grad_norm": 1.5078125, "learning_rate": 8.197379664090947e-06, "long_answer_loss": 0.0852, "loss": 0.072, "short_answer_loss": NaN, "step": 771, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.0758, "grad_norm": 1.6015625, "learning_rate": 8.166618003548235e-06, "long_answer_loss": 0.0758, "loss": 0.0774, "short_answer_loss": NaN, "step": 772, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.0672, "grad_norm": 1.40625, "learning_rate": 8.135886142770232e-06, "long_answer_loss": 0.0672, "loss": 0.0679, "short_answer_loss": NaN, "step": 773, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.0588, "grad_norm": 1.453125, "learning_rate": 8.105184293093545e-06, "long_answer_loss": 0.0588, "loss": 0.0678, "short_answer_loss": NaN, "step": 774, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.0543, "grad_norm": 1.4765625, "learning_rate": 8.074512665648392e-06, "long_answer_loss": 0.0543, "loss": 0.074, "short_answer_loss": NaN, "step": 775, "template_loss": 0.0 }, { "epoch": 1.25, "full_loss": 0.076, "grad_norm": 1.515625, "learning_rate": 8.04387147135716e-06, "long_answer_loss": 0.076, "loss": 0.0697, "short_answer_loss": NaN, "step": 776, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.0697, "grad_norm": 1.4765625, "learning_rate": 8.013260920932957e-06, "long_answer_loss": 0.0697, "loss": 0.0702, "short_answer_loss": NaN, "step": 777, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.085, "grad_norm": 1.46875, "learning_rate": 7.982681224878157e-06, "long_answer_loss": 0.085, "loss": 0.0701, "short_answer_loss": NaN, "step": 778, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.0754, "grad_norm": 1.5546875, "learning_rate": 7.952132593482956e-06, "long_answer_loss": 0.0754, "loss": 0.0753, "short_answer_loss": NaN, "step": 779, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.0818, "grad_norm": 1.484375, "learning_rate": 7.921615236823924e-06, "long_answer_loss": 0.0818, "loss": 0.0676, "short_answer_loss": NaN, "step": 780, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.0902, "grad_norm": 1.53125, "learning_rate": 7.891129364762559e-06, "long_answer_loss": 0.0902, "loss": 0.0805, "short_answer_loss": NaN, "step": 781, "template_loss": 0.0 }, { "epoch": 1.26, "full_loss": 0.0507, "grad_norm": 1.484375, "learning_rate": 7.860675186943853e-06, "long_answer_loss": 0.0507, "loss": 0.0654, "short_answer_loss": NaN, "step": 782, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0574, "grad_norm": 1.5, "learning_rate": 7.830252912794836e-06, "long_answer_loss": 0.0574, "loss": 0.0741, "short_answer_loss": NaN, "step": 783, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0782, "grad_norm": 1.59375, "learning_rate": 7.799862751523146e-06, "long_answer_loss": 0.0782, "loss": 0.0735, "short_answer_loss": NaN, "step": 784, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0597, "grad_norm": 1.4765625, "learning_rate": 7.769504912115588e-06, "long_answer_loss": 0.0597, "loss": 0.0741, "short_answer_loss": NaN, "step": 785, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0651, "grad_norm": 1.46875, "learning_rate": 7.739179603336696e-06, "long_answer_loss": 0.0651, "loss": 0.0726, "short_answer_loss": NaN, "step": 786, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0657, "grad_norm": 1.4296875, "learning_rate": 7.708887033727291e-06, "long_answer_loss": 0.0657, "loss": 0.0688, "short_answer_loss": NaN, "step": 787, "template_loss": 0.0 }, { "epoch": 1.27, "full_loss": 0.0599, "grad_norm": 1.40625, "learning_rate": 7.678627411603074e-06, "long_answer_loss": 0.0599, "loss": 0.0687, "short_answer_loss": NaN, "step": 788, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0741, "grad_norm": 1.4921875, "learning_rate": 7.648400945053146e-06, "long_answer_loss": 0.0741, "loss": 0.0701, "short_answer_loss": NaN, "step": 789, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0807, "grad_norm": 1.5546875, "learning_rate": 7.618207841938624e-06, "long_answer_loss": 0.0807, "loss": 0.0744, "short_answer_loss": NaN, "step": 790, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0706, "grad_norm": 1.53125, "learning_rate": 7.588048309891181e-06, "long_answer_loss": 0.0706, "loss": 0.0712, "short_answer_loss": NaN, "step": 791, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0788, "grad_norm": 1.4609375, "learning_rate": 7.557922556311634e-06, "long_answer_loss": 0.0788, "loss": 0.0698, "short_answer_loss": NaN, "step": 792, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0771, "grad_norm": 1.4609375, "learning_rate": 7.527830788368509e-06, "long_answer_loss": 0.0771, "loss": 0.0732, "short_answer_loss": NaN, "step": 793, "template_loss": 0.0 }, { "epoch": 1.28, "full_loss": 0.0605, "grad_norm": 1.6484375, "learning_rate": 7.497773212996623e-06, "long_answer_loss": 0.0605, "loss": 0.0793, "short_answer_loss": NaN, "step": 794, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0751, "grad_norm": 1.59375, "learning_rate": 7.467750036895657e-06, "long_answer_loss": 0.0751, "loss": 0.0742, "short_answer_loss": NaN, "step": 795, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0719, "grad_norm": 1.4921875, "learning_rate": 7.437761466528731e-06, "long_answer_loss": 0.0719, "loss": 0.0717, "short_answer_loss": NaN, "step": 796, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0624, "grad_norm": 1.5234375, "learning_rate": 7.407807708120998e-06, "long_answer_loss": 0.0624, "loss": 0.0686, "short_answer_loss": NaN, "step": 797, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0642, "grad_norm": 1.4140625, "learning_rate": 7.377888967658206e-06, "long_answer_loss": 0.0642, "loss": 0.0701, "short_answer_loss": NaN, "step": 798, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0696, "grad_norm": 1.5390625, "learning_rate": 7.348005450885301e-06, "long_answer_loss": 0.0696, "loss": 0.0692, "short_answer_loss": NaN, "step": 799, "template_loss": 0.0 }, { "epoch": 1.29, "full_loss": 0.0766, "grad_norm": 1.5234375, "learning_rate": 7.318157363304995e-06, "long_answer_loss": 0.0766, "loss": 0.071, "short_answer_loss": NaN, "step": 800, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0667, "grad_norm": 1.5390625, "learning_rate": 7.288344910176365e-06, "long_answer_loss": 0.0667, "loss": 0.0734, "short_answer_loss": NaN, "step": 801, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0701, "grad_norm": 1.5625, "learning_rate": 7.258568296513439e-06, "long_answer_loss": 0.0701, "loss": 0.0686, "short_answer_loss": NaN, "step": 802, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0903, "grad_norm": 1.6171875, "learning_rate": 7.228827727083781e-06, "long_answer_loss": 0.0903, "loss": 0.0759, "short_answer_loss": NaN, "step": 803, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0726, "grad_norm": 1.53125, "learning_rate": 7.199123406407089e-06, "long_answer_loss": 0.0726, "loss": 0.0688, "short_answer_loss": NaN, "step": 804, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0762, "grad_norm": 1.5, "learning_rate": 7.169455538753783e-06, "long_answer_loss": 0.0762, "loss": 0.0719, "short_answer_loss": NaN, "step": 805, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0821, "grad_norm": 1.453125, "learning_rate": 7.139824328143604e-06, "long_answer_loss": 0.0821, "loss": 0.0671, "short_answer_loss": NaN, "step": 806, "template_loss": 0.0 }, { "epoch": 1.3, "full_loss": 0.0701, "grad_norm": 1.4765625, "learning_rate": 7.110229978344212e-06, "long_answer_loss": 0.0701, "loss": 0.0714, "short_answer_loss": NaN, "step": 807, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0829, "grad_norm": 1.5625, "learning_rate": 7.080672692869783e-06, "long_answer_loss": 0.0829, "loss": 0.0677, "short_answer_loss": NaN, "step": 808, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0703, "grad_norm": 1.40625, "learning_rate": 7.051152674979608e-06, "long_answer_loss": 0.0703, "loss": 0.0672, "short_answer_loss": NaN, "step": 809, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0737, "grad_norm": 1.5078125, "learning_rate": 7.0216701276766936e-06, "long_answer_loss": 0.0737, "loss": 0.0744, "short_answer_loss": NaN, "step": 810, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0751, "grad_norm": 1.5546875, "learning_rate": 6.992225253706374e-06, "long_answer_loss": 0.0751, "loss": 0.0699, "short_answer_loss": NaN, "step": 811, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0662, "grad_norm": 1.625, "learning_rate": 6.962818255554911e-06, "long_answer_loss": 0.0662, "loss": 0.0747, "short_answer_loss": NaN, "step": 812, "template_loss": 0.0 }, { "epoch": 1.31, "full_loss": 0.0629, "grad_norm": 1.4765625, "learning_rate": 6.9334493354480985e-06, "long_answer_loss": 0.0629, "loss": 0.0684, "short_answer_loss": NaN, "step": 813, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0756, "grad_norm": 1.5234375, "learning_rate": 6.904118695349882e-06, "long_answer_loss": 0.0756, "loss": 0.0747, "short_answer_loss": NaN, "step": 814, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0698, "grad_norm": 1.5390625, "learning_rate": 6.874826536960954e-06, "long_answer_loss": 0.0698, "loss": 0.0741, "short_answer_loss": NaN, "step": 815, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0696, "grad_norm": 1.5078125, "learning_rate": 6.845573061717387e-06, "long_answer_loss": 0.0696, "loss": 0.0777, "short_answer_loss": NaN, "step": 816, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0788, "grad_norm": 1.5546875, "learning_rate": 6.8163584707892306e-06, "long_answer_loss": 0.0788, "loss": 0.0712, "short_answer_loss": NaN, "step": 817, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0925, "grad_norm": 1.6171875, "learning_rate": 6.7871829650791365e-06, "long_answer_loss": 0.0925, "loss": 0.0769, "short_answer_loss": NaN, "step": 818, "template_loss": 0.0 }, { "epoch": 1.32, "full_loss": 0.0659, "grad_norm": 1.5703125, "learning_rate": 6.758046745220978e-06, "long_answer_loss": 0.0659, "loss": 0.0697, "short_answer_loss": NaN, "step": 819, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.063, "grad_norm": 1.59375, "learning_rate": 6.728950011578462e-06, "long_answer_loss": 0.063, "loss": 0.0692, "short_answer_loss": NaN, "step": 820, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.0639, "grad_norm": 1.578125, "learning_rate": 6.6998929642437645e-06, "long_answer_loss": 0.0639, "loss": 0.0731, "short_answer_loss": NaN, "step": 821, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.0546, "grad_norm": 1.421875, "learning_rate": 6.670875803036141e-06, "long_answer_loss": 0.0546, "loss": 0.07, "short_answer_loss": NaN, "step": 822, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.0727, "grad_norm": 1.5625, "learning_rate": 6.64189872750056e-06, "long_answer_loss": 0.0727, "loss": 0.0686, "short_answer_loss": NaN, "step": 823, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.0826, "grad_norm": 1.4765625, "learning_rate": 6.612961936906333e-06, "long_answer_loss": 0.0826, "loss": 0.074, "short_answer_loss": NaN, "step": 824, "template_loss": 0.0 }, { "epoch": 1.33, "full_loss": 0.0692, "grad_norm": 1.4453125, "learning_rate": 6.584065630245734e-06, "long_answer_loss": 0.0692, "loss": 0.0688, "short_answer_loss": NaN, "step": 825, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.0577, "grad_norm": 1.4921875, "learning_rate": 6.55521000623264e-06, "long_answer_loss": 0.0577, "loss": 0.0671, "short_answer_loss": NaN, "step": 826, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.0838, "grad_norm": 1.515625, "learning_rate": 6.526395263301166e-06, "long_answer_loss": 0.0838, "loss": 0.0716, "short_answer_loss": NaN, "step": 827, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.0701, "grad_norm": 1.5078125, "learning_rate": 6.497621599604292e-06, "long_answer_loss": 0.0701, "loss": 0.0693, "short_answer_loss": NaN, "step": 828, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.082, "grad_norm": 1.5078125, "learning_rate": 6.468889213012502e-06, "long_answer_loss": 0.082, "loss": 0.0712, "short_answer_loss": NaN, "step": 829, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.0891, "grad_norm": 1.4375, "learning_rate": 6.440198301112434e-06, "long_answer_loss": 0.0891, "loss": 0.0723, "short_answer_loss": NaN, "step": 830, "template_loss": 0.0 }, { "epoch": 1.34, "full_loss": 0.0972, "grad_norm": 1.5859375, "learning_rate": 6.411549061205505e-06, "long_answer_loss": 0.0972, "loss": 0.0753, "short_answer_loss": NaN, "step": 831, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0513, "grad_norm": 1.5703125, "learning_rate": 6.382941690306568e-06, "long_answer_loss": 0.0513, "loss": 0.0653, "short_answer_loss": NaN, "step": 832, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0743, "grad_norm": 1.4765625, "learning_rate": 6.35437638514255e-06, "long_answer_loss": 0.0743, "loss": 0.0673, "short_answer_loss": NaN, "step": 833, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0729, "grad_norm": 1.390625, "learning_rate": 6.325853342151097e-06, "long_answer_loss": 0.0729, "loss": 0.0671, "short_answer_loss": NaN, "step": 834, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0751, "grad_norm": 1.6484375, "learning_rate": 6.2973727574792345e-06, "long_answer_loss": 0.0751, "loss": 0.0704, "short_answer_loss": NaN, "step": 835, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0665, "grad_norm": 1.6640625, "learning_rate": 6.2689348269820036e-06, "long_answer_loss": 0.0665, "loss": 0.0742, "short_answer_loss": NaN, "step": 836, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0658, "grad_norm": 1.5078125, "learning_rate": 6.240539746221127e-06, "long_answer_loss": 0.0658, "loss": 0.0703, "short_answer_loss": NaN, "step": 837, "template_loss": 0.0 }, { "epoch": 1.35, "full_loss": 0.0639, "grad_norm": 1.4375, "learning_rate": 6.212187710463654e-06, "long_answer_loss": 0.0639, "loss": 0.075, "short_answer_loss": NaN, "step": 838, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0708, "grad_norm": 1.5703125, "learning_rate": 6.1838789146806254e-06, "long_answer_loss": 0.0708, "loss": 0.0681, "short_answer_loss": NaN, "step": 839, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0712, "grad_norm": 1.5859375, "learning_rate": 6.155613553545729e-06, "long_answer_loss": 0.0712, "loss": 0.0749, "short_answer_loss": NaN, "step": 840, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0751, "grad_norm": 1.40625, "learning_rate": 6.127391821433961e-06, "long_answer_loss": 0.0751, "loss": 0.0717, "short_answer_loss": NaN, "step": 841, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0764, "grad_norm": 1.5546875, "learning_rate": 6.0992139124202914e-06, "long_answer_loss": 0.0764, "loss": 0.0735, "short_answer_loss": NaN, "step": 842, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0701, "grad_norm": 1.5703125, "learning_rate": 6.071080020278326e-06, "long_answer_loss": 0.0701, "loss": 0.0706, "short_answer_loss": NaN, "step": 843, "template_loss": 0.0 }, { "epoch": 1.36, "full_loss": 0.0805, "grad_norm": 1.609375, "learning_rate": 6.0429903384789775e-06, "long_answer_loss": 0.0805, "loss": 0.0728, "short_answer_loss": NaN, "step": 844, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0748, "grad_norm": 1.6640625, "learning_rate": 6.0149450601891325e-06, "long_answer_loss": 0.0748, "loss": 0.0817, "short_answer_loss": NaN, "step": 845, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0626, "grad_norm": 1.5390625, "learning_rate": 5.986944378270323e-06, "long_answer_loss": 0.0626, "loss": 0.0746, "short_answer_loss": NaN, "step": 846, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0844, "grad_norm": 1.578125, "learning_rate": 5.958988485277401e-06, "long_answer_loss": 0.0844, "loss": 0.0703, "short_answer_loss": NaN, "step": 847, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0873, "grad_norm": 1.71875, "learning_rate": 5.93107757345722e-06, "long_answer_loss": 0.0873, "loss": 0.0702, "short_answer_loss": NaN, "step": 848, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0692, "grad_norm": 1.453125, "learning_rate": 5.9032118347472965e-06, "long_answer_loss": 0.0692, "loss": 0.0692, "short_answer_loss": NaN, "step": 849, "template_loss": 0.0 }, { "epoch": 1.37, "full_loss": 0.0654, "grad_norm": 1.453125, "learning_rate": 5.87539146077451e-06, "long_answer_loss": 0.0654, "loss": 0.0667, "short_answer_loss": NaN, "step": 850, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0774, "grad_norm": 1.4765625, "learning_rate": 5.847616642853773e-06, "long_answer_loss": 0.0774, "loss": 0.069, "short_answer_loss": NaN, "step": 851, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0641, "grad_norm": 1.5234375, "learning_rate": 5.81988757198672e-06, "long_answer_loss": 0.0641, "loss": 0.0725, "short_answer_loss": NaN, "step": 852, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0771, "grad_norm": 1.5390625, "learning_rate": 5.792204438860391e-06, "long_answer_loss": 0.0771, "loss": 0.0728, "short_answer_loss": NaN, "step": 853, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0661, "grad_norm": 1.515625, "learning_rate": 5.764567433845915e-06, "long_answer_loss": 0.0661, "loss": 0.07, "short_answer_loss": NaN, "step": 854, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0721, "grad_norm": 1.5, "learning_rate": 5.736976746997226e-06, "long_answer_loss": 0.0721, "loss": 0.069, "short_answer_loss": NaN, "step": 855, "template_loss": 0.0 }, { "epoch": 1.38, "full_loss": 0.0943, "grad_norm": 1.453125, "learning_rate": 5.709432568049722e-06, "long_answer_loss": 0.0943, "loss": 0.073, "short_answer_loss": NaN, "step": 856, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0672, "grad_norm": 1.515625, "learning_rate": 5.681935086418978e-06, "long_answer_loss": 0.0672, "loss": 0.0715, "short_answer_loss": NaN, "step": 857, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0625, "grad_norm": 1.6328125, "learning_rate": 5.654484491199446e-06, "long_answer_loss": 0.0625, "loss": 0.0724, "short_answer_loss": NaN, "step": 858, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0597, "grad_norm": 1.5, "learning_rate": 5.627080971163146e-06, "long_answer_loss": 0.0597, "loss": 0.0674, "short_answer_loss": NaN, "step": 859, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0597, "grad_norm": 1.4921875, "learning_rate": 5.599724714758374e-06, "long_answer_loss": 0.0597, "loss": 0.0716, "short_answer_loss": NaN, "step": 860, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0699, "grad_norm": 1.515625, "learning_rate": 5.572415910108401e-06, "long_answer_loss": 0.0699, "loss": 0.0742, "short_answer_loss": NaN, "step": 861, "template_loss": 0.0 }, { "epoch": 1.39, "full_loss": 0.0782, "grad_norm": 1.515625, "learning_rate": 5.545154745010187e-06, "long_answer_loss": 0.0782, "loss": 0.073, "short_answer_loss": NaN, "step": 862, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.0752, "grad_norm": 1.4375, "learning_rate": 5.5179414069330786e-06, "long_answer_loss": 0.0752, "loss": 0.0714, "short_answer_loss": NaN, "step": 863, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.06, "grad_norm": 1.375, "learning_rate": 5.490776083017532e-06, "long_answer_loss": 0.06, "loss": 0.0691, "short_answer_loss": NaN, "step": 864, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.0703, "grad_norm": 1.5078125, "learning_rate": 5.463658960073816e-06, "long_answer_loss": 0.0703, "loss": 0.0703, "short_answer_loss": NaN, "step": 865, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.063, "grad_norm": 1.4140625, "learning_rate": 5.436590224580733e-06, "long_answer_loss": 0.063, "loss": 0.0685, "short_answer_loss": NaN, "step": 866, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.0656, "grad_norm": 1.4296875, "learning_rate": 5.409570062684334e-06, "long_answer_loss": 0.0656, "loss": 0.0685, "short_answer_loss": NaN, "step": 867, "template_loss": 0.0 }, { "epoch": 1.4, "full_loss": 0.0677, "grad_norm": 1.4921875, "learning_rate": 5.382598660196642e-06, "long_answer_loss": 0.0677, "loss": 0.0706, "short_answer_loss": NaN, "step": 868, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0569, "grad_norm": 1.4375, "learning_rate": 5.355676202594367e-06, "long_answer_loss": 0.0569, "loss": 0.0679, "short_answer_loss": NaN, "step": 869, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0708, "grad_norm": 1.5390625, "learning_rate": 5.3288028750176395e-06, "long_answer_loss": 0.0708, "loss": 0.0693, "short_answer_loss": NaN, "step": 870, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0767, "grad_norm": 1.4609375, "learning_rate": 5.301978862268733e-06, "long_answer_loss": 0.0767, "loss": 0.0732, "short_answer_loss": NaN, "step": 871, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0593, "grad_norm": 1.4375, "learning_rate": 5.275204348810789e-06, "long_answer_loss": 0.0593, "loss": 0.0681, "short_answer_loss": NaN, "step": 872, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0584, "grad_norm": 1.3984375, "learning_rate": 5.248479518766558e-06, "long_answer_loss": 0.0584, "loss": 0.0679, "short_answer_loss": NaN, "step": 873, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0605, "grad_norm": 1.390625, "learning_rate": 5.221804555917123e-06, "long_answer_loss": 0.0605, "loss": 0.0648, "short_answer_loss": NaN, "step": 874, "template_loss": 0.0 }, { "epoch": 1.41, "full_loss": 0.0637, "grad_norm": 1.4140625, "learning_rate": 5.195179643700646e-06, "long_answer_loss": 0.0637, "loss": 0.0661, "short_answer_loss": NaN, "step": 875, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.0591, "grad_norm": 1.5390625, "learning_rate": 5.168604965211096e-06, "long_answer_loss": 0.0591, "loss": 0.0699, "short_answer_loss": NaN, "step": 876, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.0751, "grad_norm": 1.5234375, "learning_rate": 5.142080703197e-06, "long_answer_loss": 0.0751, "loss": 0.0725, "short_answer_loss": NaN, "step": 877, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.0624, "grad_norm": 1.4765625, "learning_rate": 5.115607040060177e-06, "long_answer_loss": 0.0624, "loss": 0.0733, "short_answer_loss": NaN, "step": 878, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.0758, "grad_norm": 1.4375, "learning_rate": 5.089184157854491e-06, "long_answer_loss": 0.0758, "loss": 0.0685, "short_answer_loss": NaN, "step": 879, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.06, "grad_norm": 1.578125, "learning_rate": 5.0628122382845935e-06, "long_answer_loss": 0.06, "loss": 0.0703, "short_answer_loss": NaN, "step": 880, "template_loss": 0.0 }, { "epoch": 1.42, "full_loss": 0.069, "grad_norm": 1.4765625, "learning_rate": 5.036491462704682e-06, "long_answer_loss": 0.069, "loss": 0.077, "short_answer_loss": NaN, "step": 881, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.0749, "grad_norm": 1.5625, "learning_rate": 5.010222012117238e-06, "long_answer_loss": 0.0749, "loss": 0.0736, "short_answer_loss": NaN, "step": 882, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.0579, "grad_norm": 1.5234375, "learning_rate": 4.984004067171803e-06, "long_answer_loss": 0.0579, "loss": 0.0701, "short_answer_loss": NaN, "step": 883, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.0791, "grad_norm": 1.4140625, "learning_rate": 4.957837808163718e-06, "long_answer_loss": 0.0791, "loss": 0.0688, "short_answer_loss": NaN, "step": 884, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.0586, "grad_norm": 1.546875, "learning_rate": 4.931723415032889e-06, "long_answer_loss": 0.0586, "loss": 0.0723, "short_answer_loss": NaN, "step": 885, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.0948, "grad_norm": 1.46875, "learning_rate": 4.905661067362558e-06, "long_answer_loss": 0.0948, "loss": 0.0772, "short_answer_loss": NaN, "step": 886, "template_loss": 0.0 }, { "epoch": 1.43, "full_loss": 0.072, "grad_norm": 1.4140625, "learning_rate": 4.87965094437805e-06, "long_answer_loss": 0.072, "loss": 0.0702, "short_answer_loss": NaN, "step": 887, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.0702, "grad_norm": 1.53125, "learning_rate": 4.853693224945569e-06, "long_answer_loss": 0.0702, "loss": 0.0729, "short_answer_loss": NaN, "step": 888, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.0709, "grad_norm": 1.3671875, "learning_rate": 4.827788087570936e-06, "long_answer_loss": 0.0709, "loss": 0.0689, "short_answer_loss": NaN, "step": 889, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.0619, "grad_norm": 1.4609375, "learning_rate": 4.801935710398382e-06, "long_answer_loss": 0.0619, "loss": 0.0666, "short_answer_loss": NaN, "step": 890, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.0726, "grad_norm": 1.4375, "learning_rate": 4.776136271209315e-06, "long_answer_loss": 0.0726, "loss": 0.0689, "short_answer_loss": NaN, "step": 891, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.074, "grad_norm": 1.5390625, "learning_rate": 4.750389947421101e-06, "long_answer_loss": 0.074, "loss": 0.075, "short_answer_loss": NaN, "step": 892, "template_loss": 0.0 }, { "epoch": 1.44, "full_loss": 0.0644, "grad_norm": 1.4375, "learning_rate": 4.724696916085841e-06, "long_answer_loss": 0.0644, "loss": 0.0693, "short_answer_loss": NaN, "step": 893, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0673, "grad_norm": 1.4296875, "learning_rate": 4.699057353889157e-06, "long_answer_loss": 0.0673, "loss": 0.0686, "short_answer_loss": NaN, "step": 894, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0895, "grad_norm": 1.5078125, "learning_rate": 4.673471437148973e-06, "long_answer_loss": 0.0895, "loss": 0.0745, "short_answer_loss": NaN, "step": 895, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0645, "grad_norm": 1.3828125, "learning_rate": 4.64793934181431e-06, "long_answer_loss": 0.0645, "loss": 0.0694, "short_answer_loss": NaN, "step": 896, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0683, "grad_norm": 1.34375, "learning_rate": 4.6224612434640575e-06, "long_answer_loss": 0.0683, "loss": 0.0634, "short_answer_loss": NaN, "step": 897, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0634, "grad_norm": 1.4453125, "learning_rate": 4.597037317305788e-06, "long_answer_loss": 0.0634, "loss": 0.0654, "short_answer_loss": NaN, "step": 898, "template_loss": 0.0 }, { "epoch": 1.45, "full_loss": 0.0683, "grad_norm": 1.453125, "learning_rate": 4.571667738174547e-06, "long_answer_loss": 0.0683, "loss": 0.0709, "short_answer_loss": NaN, "step": 899, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0723, "grad_norm": 1.5625, "learning_rate": 4.546352680531639e-06, "long_answer_loss": 0.0723, "loss": 0.0689, "short_answer_loss": NaN, "step": 900, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0885, "grad_norm": 1.5078125, "learning_rate": 4.521092318463439e-06, "long_answer_loss": 0.0885, "loss": 0.0696, "short_answer_loss": NaN, "step": 901, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0772, "grad_norm": 1.421875, "learning_rate": 4.495886825680192e-06, "long_answer_loss": 0.0772, "loss": 0.0681, "short_answer_loss": NaN, "step": 902, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0694, "grad_norm": 1.6171875, "learning_rate": 4.470736375514818e-06, "long_answer_loss": 0.0694, "loss": 0.0776, "short_answer_loss": NaN, "step": 903, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0899, "grad_norm": 1.5, "learning_rate": 4.445641140921721e-06, "long_answer_loss": 0.0899, "loss": 0.068, "short_answer_loss": NaN, "step": 904, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0637, "grad_norm": 1.5, "learning_rate": 4.420601294475595e-06, "long_answer_loss": 0.0637, "loss": 0.0728, "short_answer_loss": NaN, "step": 905, "template_loss": 0.0 }, { "epoch": 1.46, "full_loss": 0.0602, "grad_norm": 1.375, "learning_rate": 4.395617008370248e-06, "long_answer_loss": 0.0602, "loss": 0.0677, "short_answer_loss": NaN, "step": 906, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0693, "grad_norm": 1.4921875, "learning_rate": 4.370688454417405e-06, "long_answer_loss": 0.0693, "loss": 0.0706, "short_answer_loss": NaN, "step": 907, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0855, "grad_norm": 1.4921875, "learning_rate": 4.345815804045539e-06, "long_answer_loss": 0.0855, "loss": 0.0667, "short_answer_loss": NaN, "step": 908, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0583, "grad_norm": 1.3828125, "learning_rate": 4.320999228298678e-06, "long_answer_loss": 0.0583, "loss": 0.064, "short_answer_loss": NaN, "step": 909, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0899, "grad_norm": 1.4765625, "learning_rate": 4.2962388978352435e-06, "long_answer_loss": 0.0899, "loss": 0.0693, "short_answer_loss": NaN, "step": 910, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0773, "grad_norm": 1.515625, "learning_rate": 4.271534982926864e-06, "long_answer_loss": 0.0773, "loss": 0.0704, "short_answer_loss": NaN, "step": 911, "template_loss": 0.0 }, { "epoch": 1.47, "full_loss": 0.0494, "grad_norm": 1.421875, "learning_rate": 4.246887653457216e-06, "long_answer_loss": 0.0494, "loss": 0.0607, "short_answer_loss": NaN, "step": 912, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0653, "grad_norm": 1.4296875, "learning_rate": 4.222297078920845e-06, "long_answer_loss": 0.0653, "loss": 0.0719, "short_answer_loss": NaN, "step": 913, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0765, "grad_norm": 1.453125, "learning_rate": 4.197763428422005e-06, "long_answer_loss": 0.0765, "loss": 0.0679, "short_answer_loss": NaN, "step": 914, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0643, "grad_norm": 1.4453125, "learning_rate": 4.173286870673498e-06, "long_answer_loss": 0.0643, "loss": 0.067, "short_answer_loss": NaN, "step": 915, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0814, "grad_norm": 1.5078125, "learning_rate": 4.148867573995511e-06, "long_answer_loss": 0.0814, "loss": 0.0765, "short_answer_loss": NaN, "step": 916, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0628, "grad_norm": 1.4453125, "learning_rate": 4.124505706314455e-06, "long_answer_loss": 0.0628, "loss": 0.0687, "short_answer_loss": NaN, "step": 917, "template_loss": 0.0 }, { "epoch": 1.48, "full_loss": 0.0651, "grad_norm": 1.625, "learning_rate": 4.100201435161817e-06, "long_answer_loss": 0.0651, "loss": 0.0749, "short_answer_loss": NaN, "step": 918, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.089, "grad_norm": 1.375, "learning_rate": 4.0759549276730025e-06, "long_answer_loss": 0.089, "loss": 0.0727, "short_answer_loss": NaN, "step": 919, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.0845, "grad_norm": 1.453125, "learning_rate": 4.051766350586187e-06, "long_answer_loss": 0.0845, "loss": 0.0733, "short_answer_loss": NaN, "step": 920, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.0693, "grad_norm": 1.421875, "learning_rate": 4.027635870241178e-06, "long_answer_loss": 0.0693, "loss": 0.0664, "short_answer_loss": NaN, "step": 921, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.0697, "grad_norm": 1.484375, "learning_rate": 4.003563652578258e-06, "long_answer_loss": 0.0697, "loss": 0.07, "short_answer_loss": NaN, "step": 922, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.0664, "grad_norm": 1.453125, "learning_rate": 3.9795498631370515e-06, "long_answer_loss": 0.0664, "loss": 0.0751, "short_answer_loss": NaN, "step": 923, "template_loss": 0.0 }, { "epoch": 1.49, "full_loss": 0.0788, "grad_norm": 1.4921875, "learning_rate": 3.9555946670553774e-06, "long_answer_loss": 0.0788, "loss": 0.0727, "short_answer_loss": NaN, "step": 924, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0861, "grad_norm": 1.5, "learning_rate": 3.931698229068131e-06, "long_answer_loss": 0.0861, "loss": 0.0738, "short_answer_loss": NaN, "step": 925, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0731, "grad_norm": 1.4609375, "learning_rate": 3.907860713506132e-06, "long_answer_loss": 0.0731, "loss": 0.0671, "short_answer_loss": NaN, "step": 926, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0647, "grad_norm": 1.453125, "learning_rate": 3.884082284295008e-06, "long_answer_loss": 0.0647, "loss": 0.072, "short_answer_loss": NaN, "step": 927, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0623, "grad_norm": 1.4296875, "learning_rate": 3.860363104954059e-06, "long_answer_loss": 0.0623, "loss": 0.0685, "short_answer_loss": NaN, "step": 928, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0813, "grad_norm": 1.4140625, "learning_rate": 3.836703338595138e-06, "long_answer_loss": 0.0813, "loss": 0.0702, "short_answer_loss": NaN, "step": 929, "template_loss": 0.0 }, { "epoch": 1.5, "full_loss": 0.0863, "grad_norm": 1.5625, "learning_rate": 3.813103147921526e-06, "long_answer_loss": 0.0863, "loss": 0.0722, "short_answer_loss": NaN, "step": 930, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0466, "grad_norm": 1.421875, "learning_rate": 3.7895626952268155e-06, "long_answer_loss": 0.0466, "loss": 0.0638, "short_answer_loss": NaN, "step": 931, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0831, "grad_norm": 1.484375, "learning_rate": 3.766082142393791e-06, "long_answer_loss": 0.0831, "loss": 0.0741, "short_answer_loss": NaN, "step": 932, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0468, "grad_norm": 1.484375, "learning_rate": 3.7426616508933214e-06, "long_answer_loss": 0.0468, "loss": 0.0678, "short_answer_loss": NaN, "step": 933, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0818, "grad_norm": 1.40625, "learning_rate": 3.7193013817832454e-06, "long_answer_loss": 0.0818, "loss": 0.0737, "short_answer_loss": NaN, "step": 934, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0733, "grad_norm": 1.484375, "learning_rate": 3.696001495707263e-06, "long_answer_loss": 0.0733, "loss": 0.0689, "short_answer_loss": NaN, "step": 935, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.067, "grad_norm": 1.40625, "learning_rate": 3.672762152893834e-06, "long_answer_loss": 0.067, "loss": 0.0668, "short_answer_loss": NaN, "step": 936, "template_loss": 0.0 }, { "epoch": 1.51, "full_loss": 0.0583, "grad_norm": 1.53125, "learning_rate": 3.6495835131550748e-06, "long_answer_loss": 0.0583, "loss": 0.0686, "short_answer_loss": NaN, "step": 937, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.0655, "grad_norm": 1.4609375, "learning_rate": 3.6264657358856604e-06, "long_answer_loss": 0.0655, "loss": 0.067, "short_answer_loss": NaN, "step": 938, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.0497, "grad_norm": 1.4296875, "learning_rate": 3.603408980061726e-06, "long_answer_loss": 0.0497, "loss": 0.0665, "short_answer_loss": NaN, "step": 939, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.0841, "grad_norm": 1.3984375, "learning_rate": 3.5804134042397743e-06, "long_answer_loss": 0.0841, "loss": 0.0685, "short_answer_loss": NaN, "step": 940, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.0685, "grad_norm": 1.3984375, "learning_rate": 3.5574791665555882e-06, "long_answer_loss": 0.0685, "loss": 0.0672, "short_answer_loss": NaN, "step": 941, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.071, "grad_norm": 1.484375, "learning_rate": 3.5346064247231387e-06, "long_answer_loss": 0.071, "loss": 0.0696, "short_answer_loss": NaN, "step": 942, "template_loss": 0.0 }, { "epoch": 1.52, "full_loss": 0.059, "grad_norm": 1.4609375, "learning_rate": 3.511795336033505e-06, "long_answer_loss": 0.059, "loss": 0.0691, "short_answer_loss": NaN, "step": 943, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.0613, "grad_norm": 1.375, "learning_rate": 3.489046057353787e-06, "long_answer_loss": 0.0613, "loss": 0.0661, "short_answer_loss": NaN, "step": 944, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.0751, "grad_norm": 1.59375, "learning_rate": 3.466358745126033e-06, "long_answer_loss": 0.0751, "loss": 0.071, "short_answer_loss": NaN, "step": 945, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.0732, "grad_norm": 1.5546875, "learning_rate": 3.4437335553661605e-06, "long_answer_loss": 0.0732, "loss": 0.0735, "short_answer_loss": NaN, "step": 946, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.0718, "grad_norm": 1.4765625, "learning_rate": 3.421170643662884e-06, "long_answer_loss": 0.0718, "loss": 0.0737, "short_answer_loss": NaN, "step": 947, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.0718, "grad_norm": 1.421875, "learning_rate": 3.3986701651766426e-06, "long_answer_loss": 0.0718, "loss": 0.0735, "short_answer_loss": NaN, "step": 948, "template_loss": 0.0 }, { "epoch": 1.53, "full_loss": 0.071, "grad_norm": 1.453125, "learning_rate": 3.3762322746385383e-06, "long_answer_loss": 0.071, "loss": 0.0695, "short_answer_loss": NaN, "step": 949, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.0776, "grad_norm": 1.53125, "learning_rate": 3.353857126349265e-06, "long_answer_loss": 0.0776, "loss": 0.0751, "short_answer_loss": NaN, "step": 950, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.0624, "grad_norm": 1.5625, "learning_rate": 3.3315448741780566e-06, "long_answer_loss": 0.0624, "loss": 0.0659, "short_answer_loss": NaN, "step": 951, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.0799, "grad_norm": 1.359375, "learning_rate": 3.309295671561617e-06, "long_answer_loss": 0.0799, "loss": 0.0671, "short_answer_loss": NaN, "step": 952, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.079, "grad_norm": 1.390625, "learning_rate": 3.287109671503079e-06, "long_answer_loss": 0.079, "loss": 0.0684, "short_answer_loss": NaN, "step": 953, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.0607, "grad_norm": 1.390625, "learning_rate": 3.2649870265709314e-06, "long_answer_loss": 0.0607, "loss": 0.0692, "short_answer_loss": NaN, "step": 954, "template_loss": 0.0 }, { "epoch": 1.54, "full_loss": 0.0714, "grad_norm": 1.4296875, "learning_rate": 3.2429278888980034e-06, "long_answer_loss": 0.0714, "loss": 0.0761, "short_answer_loss": NaN, "step": 955, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.0825, "grad_norm": 1.515625, "learning_rate": 3.220932410180383e-06, "long_answer_loss": 0.0825, "loss": 0.0697, "short_answer_loss": NaN, "step": 956, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.088, "grad_norm": 1.5, "learning_rate": 3.1990007416763904e-06, "long_answer_loss": 0.088, "loss": 0.0716, "short_answer_loss": NaN, "step": 957, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.0535, "grad_norm": 1.375, "learning_rate": 3.1771330342055387e-06, "long_answer_loss": 0.0535, "loss": 0.0677, "short_answer_loss": NaN, "step": 958, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.0861, "grad_norm": 1.46875, "learning_rate": 3.1553294381474946e-06, "long_answer_loss": 0.0861, "loss": 0.0702, "short_answer_loss": NaN, "step": 959, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.0687, "grad_norm": 1.3984375, "learning_rate": 3.133590103441042e-06, "long_answer_loss": 0.0687, "loss": 0.069, "short_answer_loss": NaN, "step": 960, "template_loss": 0.0 }, { "epoch": 1.55, "full_loss": 0.066, "grad_norm": 1.3515625, "learning_rate": 3.1119151795830567e-06, "long_answer_loss": 0.066, "loss": 0.0673, "short_answer_loss": NaN, "step": 961, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.0574, "grad_norm": 1.4453125, "learning_rate": 3.0903048156274707e-06, "long_answer_loss": 0.0574, "loss": 0.0683, "short_answer_loss": NaN, "step": 962, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.068, "grad_norm": 1.4609375, "learning_rate": 3.0687591601842524e-06, "long_answer_loss": 0.068, "loss": 0.0683, "short_answer_loss": NaN, "step": 963, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.056, "grad_norm": 1.375, "learning_rate": 3.047278361418382e-06, "long_answer_loss": 0.056, "loss": 0.0713, "short_answer_loss": NaN, "step": 964, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.0583, "grad_norm": 1.4453125, "learning_rate": 3.0258625670488373e-06, "long_answer_loss": 0.0583, "loss": 0.0677, "short_answer_loss": NaN, "step": 965, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.0657, "grad_norm": 1.515625, "learning_rate": 3.0045119243475696e-06, "long_answer_loss": 0.0657, "loss": 0.0694, "short_answer_loss": NaN, "step": 966, "template_loss": 0.0 }, { "epoch": 1.56, "full_loss": 0.0676, "grad_norm": 1.4140625, "learning_rate": 2.9832265801385e-06, "long_answer_loss": 0.0676, "loss": 0.0658, "short_answer_loss": NaN, "step": 967, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0581, "grad_norm": 1.640625, "learning_rate": 2.962006680796503e-06, "long_answer_loss": 0.0581, "loss": 0.0727, "short_answer_loss": NaN, "step": 968, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.093, "grad_norm": 1.375, "learning_rate": 2.940852372246404e-06, "long_answer_loss": 0.093, "loss": 0.062, "short_answer_loss": NaN, "step": 969, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0712, "grad_norm": 1.546875, "learning_rate": 2.9197637999619733e-06, "long_answer_loss": 0.0712, "loss": 0.0711, "short_answer_loss": NaN, "step": 970, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0704, "grad_norm": 1.4765625, "learning_rate": 2.898741108964925e-06, "long_answer_loss": 0.0704, "loss": 0.0712, "short_answer_loss": NaN, "step": 971, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0738, "grad_norm": 1.3984375, "learning_rate": 2.877784443823926e-06, "long_answer_loss": 0.0738, "loss": 0.068, "short_answer_loss": NaN, "step": 972, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0691, "grad_norm": 1.5625, "learning_rate": 2.856893948653591e-06, "long_answer_loss": 0.0691, "loss": 0.0727, "short_answer_loss": NaN, "step": 973, "template_loss": 0.0 }, { "epoch": 1.57, "full_loss": 0.0732, "grad_norm": 1.421875, "learning_rate": 2.836069767113503e-06, "long_answer_loss": 0.0732, "loss": 0.0667, "short_answer_loss": NaN, "step": 974, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.0636, "grad_norm": 1.4765625, "learning_rate": 2.8153120424072156e-06, "long_answer_loss": 0.0636, "loss": 0.0707, "short_answer_loss": NaN, "step": 975, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.087, "grad_norm": 1.453125, "learning_rate": 2.794620917281278e-06, "long_answer_loss": 0.087, "loss": 0.0719, "short_answer_loss": NaN, "step": 976, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.0716, "grad_norm": 1.3046875, "learning_rate": 2.773996534024241e-06, "long_answer_loss": 0.0716, "loss": 0.0605, "short_answer_loss": NaN, "step": 977, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.0541, "grad_norm": 1.421875, "learning_rate": 2.753439034465695e-06, "long_answer_loss": 0.0541, "loss": 0.0657, "short_answer_loss": NaN, "step": 978, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.0486, "grad_norm": 1.3671875, "learning_rate": 2.732948559975271e-06, "long_answer_loss": 0.0486, "loss": 0.062, "short_answer_loss": NaN, "step": 979, "template_loss": 0.0 }, { "epoch": 1.58, "full_loss": 0.0765, "grad_norm": 1.4296875, "learning_rate": 2.7125252514616966e-06, "long_answer_loss": 0.0765, "loss": 0.0697, "short_answer_loss": NaN, "step": 980, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.0601, "grad_norm": 1.3984375, "learning_rate": 2.692169249371804e-06, "long_answer_loss": 0.0601, "loss": 0.0653, "short_answer_loss": NaN, "step": 981, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.0817, "grad_norm": 1.375, "learning_rate": 2.6718806936895796e-06, "long_answer_loss": 0.0817, "loss": 0.0702, "short_answer_loss": NaN, "step": 982, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.066, "grad_norm": 1.4453125, "learning_rate": 2.651659723935189e-06, "long_answer_loss": 0.066, "loss": 0.0659, "short_answer_loss": NaN, "step": 983, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.0578, "grad_norm": 1.4609375, "learning_rate": 2.6315064791640296e-06, "long_answer_loss": 0.0578, "loss": 0.0668, "short_answer_loss": NaN, "step": 984, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.0679, "grad_norm": 1.6796875, "learning_rate": 2.6114210979657606e-06, "long_answer_loss": 0.0679, "loss": 0.0746, "short_answer_loss": NaN, "step": 985, "template_loss": 0.0 }, { "epoch": 1.59, "full_loss": 0.064, "grad_norm": 1.4296875, "learning_rate": 2.5914037184633656e-06, "long_answer_loss": 0.064, "loss": 0.064, "short_answer_loss": NaN, "step": 986, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0693, "grad_norm": 1.453125, "learning_rate": 2.571454478312185e-06, "long_answer_loss": 0.0693, "loss": 0.0659, "short_answer_loss": NaN, "step": 987, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0723, "grad_norm": 1.46875, "learning_rate": 2.5515735146989933e-06, "long_answer_loss": 0.0723, "loss": 0.0696, "short_answer_loss": NaN, "step": 988, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0792, "grad_norm": 1.5703125, "learning_rate": 2.531760964341029e-06, "long_answer_loss": 0.0792, "loss": 0.0716, "short_answer_loss": NaN, "step": 989, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0577, "grad_norm": 1.5234375, "learning_rate": 2.5120169634850713e-06, "long_answer_loss": 0.0577, "loss": 0.0652, "short_answer_loss": NaN, "step": 990, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0653, "grad_norm": 1.3984375, "learning_rate": 2.4923416479064987e-06, "long_answer_loss": 0.0653, "loss": 0.064, "short_answer_loss": NaN, "step": 991, "template_loss": 0.0 }, { "epoch": 1.6, "full_loss": 0.0761, "grad_norm": 1.421875, "learning_rate": 2.4727351529083536e-06, "long_answer_loss": 0.0761, "loss": 0.0687, "short_answer_loss": NaN, "step": 992, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0722, "grad_norm": 1.34375, "learning_rate": 2.4531976133204184e-06, "long_answer_loss": 0.0722, "loss": 0.0642, "short_answer_loss": NaN, "step": 993, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0715, "grad_norm": 1.34375, "learning_rate": 2.4337291634982757e-06, "long_answer_loss": 0.0715, "loss": 0.0659, "short_answer_loss": NaN, "step": 994, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0635, "grad_norm": 1.421875, "learning_rate": 2.4143299373224015e-06, "long_answer_loss": 0.0635, "loss": 0.0723, "short_answer_loss": NaN, "step": 995, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0864, "grad_norm": 1.4609375, "learning_rate": 2.3950000681972284e-06, "long_answer_loss": 0.0864, "loss": 0.0685, "short_answer_loss": NaN, "step": 996, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0716, "grad_norm": 1.4375, "learning_rate": 2.3757396890502382e-06, "long_answer_loss": 0.0716, "loss": 0.0719, "short_answer_loss": NaN, "step": 997, "template_loss": 0.0 }, { "epoch": 1.61, "full_loss": 0.0498, "grad_norm": 1.3828125, "learning_rate": 2.3565489323310402e-06, "long_answer_loss": 0.0498, "loss": 0.0667, "short_answer_loss": NaN, "step": 998, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0565, "grad_norm": 1.4296875, "learning_rate": 2.3374279300104733e-06, "long_answer_loss": 0.0565, "loss": 0.0625, "short_answer_loss": NaN, "step": 999, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0556, "grad_norm": 1.4375, "learning_rate": 2.31837681357968e-06, "long_answer_loss": 0.0556, "loss": 0.0653, "short_answer_loss": NaN, "step": 1000, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0831, "grad_norm": 1.5078125, "learning_rate": 2.2993957140492197e-06, "long_answer_loss": 0.0831, "loss": 0.067, "short_answer_loss": NaN, "step": 1001, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0598, "grad_norm": 1.4140625, "learning_rate": 2.2804847619481552e-06, "long_answer_loss": 0.0598, "loss": 0.0646, "short_answer_loss": NaN, "step": 1002, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0641, "grad_norm": 1.6171875, "learning_rate": 2.2616440873231655e-06, "long_answer_loss": 0.0641, "loss": 0.0697, "short_answer_loss": NaN, "step": 1003, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0806, "grad_norm": 1.5078125, "learning_rate": 2.2428738197376397e-06, "long_answer_loss": 0.0806, "loss": 0.067, "short_answer_loss": NaN, "step": 1004, "template_loss": 0.0 }, { "epoch": 1.62, "full_loss": 0.0572, "grad_norm": 1.3984375, "learning_rate": 2.224174088270796e-06, "long_answer_loss": 0.0572, "loss": 0.0665, "short_answer_loss": NaN, "step": 1005, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0618, "grad_norm": 1.53125, "learning_rate": 2.20554502151679e-06, "long_answer_loss": 0.0618, "loss": 0.0705, "short_answer_loss": NaN, "step": 1006, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0638, "grad_norm": 1.4375, "learning_rate": 2.1869867475838317e-06, "long_answer_loss": 0.0638, "loss": 0.0702, "short_answer_loss": NaN, "step": 1007, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0662, "grad_norm": 1.390625, "learning_rate": 2.1684993940933013e-06, "long_answer_loss": 0.0662, "loss": 0.0692, "short_answer_loss": NaN, "step": 1008, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0761, "grad_norm": 1.4765625, "learning_rate": 2.150083088178875e-06, "long_answer_loss": 0.0761, "loss": 0.0658, "short_answer_loss": NaN, "step": 1009, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0672, "grad_norm": 1.546875, "learning_rate": 2.131737956485652e-06, "long_answer_loss": 0.0672, "loss": 0.0746, "short_answer_loss": NaN, "step": 1010, "template_loss": 0.0 }, { "epoch": 1.63, "full_loss": 0.0695, "grad_norm": 1.4140625, "learning_rate": 2.113464125169276e-06, "long_answer_loss": 0.0695, "loss": 0.0657, "short_answer_loss": NaN, "step": 1011, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0669, "grad_norm": 1.4296875, "learning_rate": 2.0952617198950765e-06, "long_answer_loss": 0.0669, "loss": 0.0671, "short_answer_loss": NaN, "step": 1012, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0679, "grad_norm": 1.4921875, "learning_rate": 2.0771308658372015e-06, "long_answer_loss": 0.0679, "loss": 0.0685, "short_answer_loss": NaN, "step": 1013, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0656, "grad_norm": 1.5, "learning_rate": 2.059071687677755e-06, "long_answer_loss": 0.0656, "loss": 0.0698, "short_answer_loss": NaN, "step": 1014, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0591, "grad_norm": 1.515625, "learning_rate": 2.0410843096059394e-06, "long_answer_loss": 0.0591, "loss": 0.068, "short_answer_loss": NaN, "step": 1015, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0654, "grad_norm": 1.5625, "learning_rate": 2.0231688553172064e-06, "long_answer_loss": 0.0654, "loss": 0.0705, "short_answer_loss": NaN, "step": 1016, "template_loss": 0.0 }, { "epoch": 1.64, "full_loss": 0.0745, "grad_norm": 1.5234375, "learning_rate": 2.0053254480123977e-06, "long_answer_loss": 0.0745, "loss": 0.0734, "short_answer_loss": NaN, "step": 1017, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.0657, "grad_norm": 1.4453125, "learning_rate": 1.9875542103969094e-06, "long_answer_loss": 0.0657, "loss": 0.0708, "short_answer_loss": NaN, "step": 1018, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.0704, "grad_norm": 1.46875, "learning_rate": 1.969855264679836e-06, "long_answer_loss": 0.0704, "loss": 0.0684, "short_answer_loss": NaN, "step": 1019, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.07, "grad_norm": 1.4375, "learning_rate": 1.9522287325731357e-06, "long_answer_loss": 0.07, "loss": 0.0666, "short_answer_loss": NaN, "step": 1020, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.061, "grad_norm": 1.3984375, "learning_rate": 1.934674735290802e-06, "long_answer_loss": 0.061, "loss": 0.0726, "short_answer_loss": NaN, "step": 1021, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.0564, "grad_norm": 1.4765625, "learning_rate": 1.91719339354801e-06, "long_answer_loss": 0.0564, "loss": 0.0674, "short_answer_loss": NaN, "step": 1022, "template_loss": 0.0 }, { "epoch": 1.65, "full_loss": 0.0543, "grad_norm": 1.453125, "learning_rate": 1.8997848275603067e-06, "long_answer_loss": 0.0543, "loss": 0.066, "short_answer_loss": NaN, "step": 1023, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0813, "grad_norm": 1.4921875, "learning_rate": 1.8824491570427676e-06, "long_answer_loss": 0.0813, "loss": 0.0722, "short_answer_loss": NaN, "step": 1024, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0644, "grad_norm": 1.390625, "learning_rate": 1.8651865012091888e-06, "long_answer_loss": 0.0644, "loss": 0.0631, "short_answer_loss": NaN, "step": 1025, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0845, "grad_norm": 1.3671875, "learning_rate": 1.8479969787712533e-06, "long_answer_loss": 0.0845, "loss": 0.0698, "short_answer_loss": NaN, "step": 1026, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0604, "grad_norm": 1.6953125, "learning_rate": 1.830880707937725e-06, "long_answer_loss": 0.0604, "loss": 0.0776, "short_answer_loss": NaN, "step": 1027, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0618, "grad_norm": 1.5078125, "learning_rate": 1.8138378064136318e-06, "long_answer_loss": 0.0618, "loss": 0.068, "short_answer_loss": NaN, "step": 1028, "template_loss": 0.0 }, { "epoch": 1.66, "full_loss": 0.0637, "grad_norm": 1.4140625, "learning_rate": 1.796868391399452e-06, "long_answer_loss": 0.0637, "loss": 0.0703, "short_answer_loss": NaN, "step": 1029, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0725, "grad_norm": 1.5625, "learning_rate": 1.7799725795903193e-06, "long_answer_loss": 0.0725, "loss": 0.0694, "short_answer_loss": NaN, "step": 1030, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0685, "grad_norm": 1.390625, "learning_rate": 1.7631504871752066e-06, "long_answer_loss": 0.0685, "loss": 0.0648, "short_answer_loss": NaN, "step": 1031, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0777, "grad_norm": 1.4375, "learning_rate": 1.7464022298361374e-06, "long_answer_loss": 0.0777, "loss": 0.0698, "short_answer_loss": NaN, "step": 1032, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0717, "grad_norm": 1.4375, "learning_rate": 1.7297279227473874e-06, "long_answer_loss": 0.0717, "loss": 0.0673, "short_answer_loss": NaN, "step": 1033, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0736, "grad_norm": 1.515625, "learning_rate": 1.7131276805746902e-06, "long_answer_loss": 0.0736, "loss": 0.072, "short_answer_loss": NaN, "step": 1034, "template_loss": 0.0 }, { "epoch": 1.67, "full_loss": 0.0637, "grad_norm": 1.421875, "learning_rate": 1.6966016174744499e-06, "long_answer_loss": 0.0637, "loss": 0.0688, "short_answer_loss": NaN, "step": 1035, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.0729, "grad_norm": 1.453125, "learning_rate": 1.68014984709296e-06, "long_answer_loss": 0.0729, "loss": 0.0668, "short_answer_loss": NaN, "step": 1036, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.0574, "grad_norm": 1.3828125, "learning_rate": 1.6637724825656147e-06, "long_answer_loss": 0.0574, "loss": 0.0669, "short_answer_loss": NaN, "step": 1037, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.0873, "grad_norm": 1.40625, "learning_rate": 1.6474696365161358e-06, "long_answer_loss": 0.0873, "loss": 0.0667, "short_answer_loss": NaN, "step": 1038, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.064, "grad_norm": 1.53125, "learning_rate": 1.6312414210557972e-06, "long_answer_loss": 0.064, "loss": 0.0688, "short_answer_loss": NaN, "step": 1039, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.0635, "grad_norm": 1.4609375, "learning_rate": 1.615087947782655e-06, "long_answer_loss": 0.0635, "loss": 0.0644, "short_answer_loss": NaN, "step": 1040, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.073, "grad_norm": 1.4296875, "learning_rate": 1.5990093277807775e-06, "long_answer_loss": 0.073, "loss": 0.0657, "short_answer_loss": NaN, "step": 1041, "template_loss": 0.0 }, { "epoch": 1.68, "full_loss": 0.0586, "grad_norm": 1.40625, "learning_rate": 1.583005671619482e-06, "long_answer_loss": 0.0586, "loss": 0.0662, "short_answer_loss": NaN, "step": 1042, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0633, "grad_norm": 1.4375, "learning_rate": 1.5670770893525768e-06, "long_answer_loss": 0.0633, "loss": 0.0697, "short_answer_loss": NaN, "step": 1043, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0764, "grad_norm": 1.4375, "learning_rate": 1.5512236905176018e-06, "long_answer_loss": 0.0764, "loss": 0.0657, "short_answer_loss": NaN, "step": 1044, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0818, "grad_norm": 1.515625, "learning_rate": 1.5354455841350756e-06, "long_answer_loss": 0.0818, "loss": 0.0619, "short_answer_loss": NaN, "step": 1045, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0697, "grad_norm": 1.4453125, "learning_rate": 1.5197428787077472e-06, "long_answer_loss": 0.0697, "loss": 0.0696, "short_answer_loss": NaN, "step": 1046, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0705, "grad_norm": 1.421875, "learning_rate": 1.5041156822198492e-06, "long_answer_loss": 0.0705, "loss": 0.0626, "short_answer_loss": NaN, "step": 1047, "template_loss": 0.0 }, { "epoch": 1.69, "full_loss": 0.0695, "grad_norm": 1.5, "learning_rate": 1.4885641021363541e-06, "long_answer_loss": 0.0695, "loss": 0.0708, "short_answer_loss": NaN, "step": 1048, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0676, "grad_norm": 1.453125, "learning_rate": 1.4730882454022362e-06, "long_answer_loss": 0.0676, "loss": 0.066, "short_answer_loss": NaN, "step": 1049, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0787, "grad_norm": 1.5546875, "learning_rate": 1.457688218441737e-06, "long_answer_loss": 0.0787, "loss": 0.0769, "short_answer_loss": NaN, "step": 1050, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0595, "grad_norm": 1.515625, "learning_rate": 1.442364127157632e-06, "long_answer_loss": 0.0595, "loss": 0.0766, "short_answer_loss": NaN, "step": 1051, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0641, "grad_norm": 1.453125, "learning_rate": 1.4271160769305014e-06, "long_answer_loss": 0.0641, "loss": 0.0722, "short_answer_loss": NaN, "step": 1052, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0668, "grad_norm": 1.359375, "learning_rate": 1.4119441726180085e-06, "long_answer_loss": 0.0668, "loss": 0.064, "short_answer_loss": NaN, "step": 1053, "template_loss": 0.0 }, { "epoch": 1.7, "full_loss": 0.0754, "grad_norm": 1.3203125, "learning_rate": 1.396848518554178e-06, "long_answer_loss": 0.0754, "loss": 0.0658, "short_answer_loss": NaN, "step": 1054, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0611, "grad_norm": 1.4609375, "learning_rate": 1.3818292185486749e-06, "long_answer_loss": 0.0611, "loss": 0.066, "short_answer_loss": NaN, "step": 1055, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0653, "grad_norm": 1.453125, "learning_rate": 1.366886375886095e-06, "long_answer_loss": 0.0653, "loss": 0.0735, "short_answer_loss": NaN, "step": 1056, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0558, "grad_norm": 1.5546875, "learning_rate": 1.3520200933252542e-06, "long_answer_loss": 0.0558, "loss": 0.0654, "short_answer_loss": NaN, "step": 1057, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0661, "grad_norm": 1.5703125, "learning_rate": 1.337230473098476e-06, "long_answer_loss": 0.0661, "loss": 0.07, "short_answer_loss": NaN, "step": 1058, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0689, "grad_norm": 1.609375, "learning_rate": 1.322517616910897e-06, "long_answer_loss": 0.0689, "loss": 0.0716, "short_answer_loss": NaN, "step": 1059, "template_loss": 0.0 }, { "epoch": 1.71, "full_loss": 0.0685, "grad_norm": 1.4296875, "learning_rate": 1.3078816259397635e-06, "long_answer_loss": 0.0685, "loss": 0.0685, "short_answer_loss": NaN, "step": 1060, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0617, "grad_norm": 1.4453125, "learning_rate": 1.2933226008337324e-06, "long_answer_loss": 0.0617, "loss": 0.066, "short_answer_loss": NaN, "step": 1061, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0637, "grad_norm": 1.4140625, "learning_rate": 1.2788406417121867e-06, "long_answer_loss": 0.0637, "loss": 0.0643, "short_answer_loss": NaN, "step": 1062, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0675, "grad_norm": 1.5, "learning_rate": 1.2644358481645399e-06, "long_answer_loss": 0.0675, "loss": 0.0733, "short_answer_loss": NaN, "step": 1063, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0769, "grad_norm": 1.3515625, "learning_rate": 1.2501083192495544e-06, "long_answer_loss": 0.0769, "loss": 0.0676, "short_answer_loss": NaN, "step": 1064, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0555, "grad_norm": 1.46875, "learning_rate": 1.2358581534946594e-06, "long_answer_loss": 0.0555, "loss": 0.0682, "short_answer_loss": NaN, "step": 1065, "template_loss": 0.0 }, { "epoch": 1.72, "full_loss": 0.0875, "grad_norm": 1.484375, "learning_rate": 1.2216854488952753e-06, "long_answer_loss": 0.0875, "loss": 0.0691, "short_answer_loss": NaN, "step": 1066, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0669, "grad_norm": 1.453125, "learning_rate": 1.2075903029141384e-06, "long_answer_loss": 0.0669, "loss": 0.065, "short_answer_loss": NaN, "step": 1067, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0809, "grad_norm": 1.4375, "learning_rate": 1.193572812480627e-06, "long_answer_loss": 0.0809, "loss": 0.0714, "short_answer_loss": NaN, "step": 1068, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0661, "grad_norm": 1.3515625, "learning_rate": 1.1796330739901024e-06, "long_answer_loss": 0.0661, "loss": 0.0676, "short_answer_loss": NaN, "step": 1069, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0572, "grad_norm": 1.484375, "learning_rate": 1.1657711833032394e-06, "long_answer_loss": 0.0572, "loss": 0.0651, "short_answer_loss": NaN, "step": 1070, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0708, "grad_norm": 1.5078125, "learning_rate": 1.1519872357453734e-06, "long_answer_loss": 0.0708, "loss": 0.0682, "short_answer_loss": NaN, "step": 1071, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0714, "grad_norm": 1.5390625, "learning_rate": 1.1382813261058349e-06, "long_answer_loss": 0.0714, "loss": 0.0686, "short_answer_loss": NaN, "step": 1072, "template_loss": 0.0 }, { "epoch": 1.73, "full_loss": 0.0728, "grad_norm": 1.375, "learning_rate": 1.124653548637311e-06, "long_answer_loss": 0.0728, "loss": 0.0677, "short_answer_loss": NaN, "step": 1073, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0528, "grad_norm": 1.4765625, "learning_rate": 1.111103997055185e-06, "long_answer_loss": 0.0528, "loss": 0.0657, "short_answer_loss": NaN, "step": 1074, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0611, "grad_norm": 1.375, "learning_rate": 1.0976327645368975e-06, "long_answer_loss": 0.0611, "loss": 0.0662, "short_answer_loss": NaN, "step": 1075, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0783, "grad_norm": 1.515625, "learning_rate": 1.0842399437213103e-06, "long_answer_loss": 0.0783, "loss": 0.0696, "short_answer_loss": NaN, "step": 1076, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0896, "grad_norm": 1.5859375, "learning_rate": 1.0709256267080566e-06, "long_answer_loss": 0.0896, "loss": 0.0732, "short_answer_loss": NaN, "step": 1077, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0789, "grad_norm": 1.4765625, "learning_rate": 1.0576899050569204e-06, "long_answer_loss": 0.0789, "loss": 0.0686, "short_answer_loss": NaN, "step": 1078, "template_loss": 0.0 }, { "epoch": 1.74, "full_loss": 0.0569, "grad_norm": 1.5, "learning_rate": 1.0445328697872015e-06, "long_answer_loss": 0.0569, "loss": 0.0667, "short_answer_loss": NaN, "step": 1079, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0676, "grad_norm": 1.5234375, "learning_rate": 1.0314546113770876e-06, "long_answer_loss": 0.0676, "loss": 0.0672, "short_answer_loss": NaN, "step": 1080, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0694, "grad_norm": 1.5546875, "learning_rate": 1.018455219763037e-06, "long_answer_loss": 0.0694, "loss": 0.0711, "short_answer_loss": NaN, "step": 1081, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0741, "grad_norm": 1.453125, "learning_rate": 1.0055347843391557e-06, "long_answer_loss": 0.0741, "loss": 0.066, "short_answer_loss": NaN, "step": 1082, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0622, "grad_norm": 1.4765625, "learning_rate": 9.92693393956584e-07, "long_answer_loss": 0.0622, "loss": 0.0711, "short_answer_loss": NaN, "step": 1083, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0568, "grad_norm": 1.4140625, "learning_rate": 9.79931136922889e-07, "long_answer_loss": 0.0568, "loss": 0.0685, "short_answer_loss": NaN, "step": 1084, "template_loss": 0.0 }, { "epoch": 1.75, "full_loss": 0.0558, "grad_norm": 1.3671875, "learning_rate": 9.672481010014486e-07, "long_answer_loss": 0.0558, "loss": 0.0639, "short_answer_loss": NaN, "step": 1085, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.0688, "grad_norm": 1.4609375, "learning_rate": 9.54644373410861e-07, "long_answer_loss": 0.0688, "loss": 0.0715, "short_answer_loss": NaN, "step": 1086, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.1051, "grad_norm": 1.546875, "learning_rate": 9.421200408243277e-07, "long_answer_loss": 0.1051, "loss": 0.0781, "short_answer_loss": NaN, "step": 1087, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.0607, "grad_norm": 1.421875, "learning_rate": 9.296751893690808e-07, "long_answer_loss": 0.0607, "loss": 0.0668, "short_answer_loss": NaN, "step": 1088, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.0595, "grad_norm": 1.4765625, "learning_rate": 9.173099046257655e-07, "long_answer_loss": 0.0595, "loss": 0.067, "short_answer_loss": NaN, "step": 1089, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.0611, "grad_norm": 1.53125, "learning_rate": 9.050242716278676e-07, "long_answer_loss": 0.0611, "loss": 0.0711, "short_answer_loss": NaN, "step": 1090, "template_loss": 0.0 }, { "epoch": 1.76, "full_loss": 0.0559, "grad_norm": 1.4765625, "learning_rate": 8.928183748611263e-07, "long_answer_loss": 0.0559, "loss": 0.0701, "short_answer_loss": NaN, "step": 1091, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.0727, "grad_norm": 1.3984375, "learning_rate": 8.806922982629473e-07, "long_answer_loss": 0.0727, "loss": 0.0642, "short_answer_loss": NaN, "step": 1092, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.0722, "grad_norm": 1.5390625, "learning_rate": 8.686461252218323e-07, "long_answer_loss": 0.0722, "loss": 0.0711, "short_answer_loss": NaN, "step": 1093, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.0591, "grad_norm": 1.3671875, "learning_rate": 8.566799385768015e-07, "long_answer_loss": 0.0591, "loss": 0.0672, "short_answer_loss": NaN, "step": 1094, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.0665, "grad_norm": 1.3828125, "learning_rate": 8.447938206168279e-07, "long_answer_loss": 0.0665, "loss": 0.0645, "short_answer_loss": NaN, "step": 1095, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.0767, "grad_norm": 1.4296875, "learning_rate": 8.329878530802665e-07, "long_answer_loss": 0.0767, "loss": 0.0677, "short_answer_loss": NaN, "step": 1096, "template_loss": 0.0 }, { "epoch": 1.77, "full_loss": 0.058, "grad_norm": 1.375, "learning_rate": 8.21262117154295e-07, "long_answer_loss": 0.058, "loss": 0.0703, "short_answer_loss": NaN, "step": 1097, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0668, "grad_norm": 1.4140625, "learning_rate": 8.096166934743549e-07, "long_answer_loss": 0.0668, "loss": 0.0685, "short_answer_loss": NaN, "step": 1098, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0738, "grad_norm": 1.625, "learning_rate": 7.980516621235973e-07, "long_answer_loss": 0.0738, "loss": 0.0747, "short_answer_loss": NaN, "step": 1099, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0578, "grad_norm": 1.421875, "learning_rate": 7.865671026323323e-07, "long_answer_loss": 0.0578, "loss": 0.0638, "short_answer_loss": NaN, "step": 1100, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0657, "grad_norm": 1.40625, "learning_rate": 7.751630939774823e-07, "long_answer_loss": 0.0657, "loss": 0.0627, "short_answer_loss": NaN, "step": 1101, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0713, "grad_norm": 1.453125, "learning_rate": 7.638397145820361e-07, "long_answer_loss": 0.0713, "loss": 0.0705, "short_answer_loss": NaN, "step": 1102, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0662, "grad_norm": 1.4921875, "learning_rate": 7.525970423145165e-07, "long_answer_loss": 0.0662, "loss": 0.0698, "short_answer_loss": NaN, "step": 1103, "template_loss": 0.0 }, { "epoch": 1.78, "full_loss": 0.0626, "grad_norm": 1.515625, "learning_rate": 7.414351544884332e-07, "long_answer_loss": 0.0626, "loss": 0.0699, "short_answer_loss": NaN, "step": 1104, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.0612, "grad_norm": 1.5546875, "learning_rate": 7.303541278617654e-07, "long_answer_loss": 0.0612, "loss": 0.0668, "short_answer_loss": NaN, "step": 1105, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.0803, "grad_norm": 1.46875, "learning_rate": 7.193540386364203e-07, "long_answer_loss": 0.0803, "loss": 0.0755, "short_answer_loss": NaN, "step": 1106, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.1016, "grad_norm": 1.4765625, "learning_rate": 7.084349624577213e-07, "long_answer_loss": 0.1016, "loss": 0.0779, "short_answer_loss": NaN, "step": 1107, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.0583, "grad_norm": 1.3828125, "learning_rate": 6.975969744138791e-07, "long_answer_loss": 0.0583, "loss": 0.0642, "short_answer_loss": NaN, "step": 1108, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.0548, "grad_norm": 1.4921875, "learning_rate": 6.868401490354767e-07, "long_answer_loss": 0.0548, "loss": 0.0696, "short_answer_loss": NaN, "step": 1109, "template_loss": 0.0 }, { "epoch": 1.79, "full_loss": 0.0653, "grad_norm": 1.53125, "learning_rate": 6.761645602949618e-07, "long_answer_loss": 0.0653, "loss": 0.0703, "short_answer_loss": NaN, "step": 1110, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0604, "grad_norm": 1.4140625, "learning_rate": 6.655702816061316e-07, "long_answer_loss": 0.0604, "loss": 0.0645, "short_answer_loss": NaN, "step": 1111, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0587, "grad_norm": 1.46875, "learning_rate": 6.55057385823632e-07, "long_answer_loss": 0.0587, "loss": 0.0689, "short_answer_loss": NaN, "step": 1112, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0708, "grad_norm": 1.3984375, "learning_rate": 6.446259452424566e-07, "long_answer_loss": 0.0708, "loss": 0.0687, "short_answer_loss": NaN, "step": 1113, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0642, "grad_norm": 1.5546875, "learning_rate": 6.342760315974485e-07, "long_answer_loss": 0.0642, "loss": 0.0705, "short_answer_loss": NaN, "step": 1114, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0649, "grad_norm": 1.46875, "learning_rate": 6.240077160628063e-07, "long_answer_loss": 0.0649, "loss": 0.0692, "short_answer_loss": NaN, "step": 1115, "template_loss": 0.0 }, { "epoch": 1.8, "full_loss": 0.0659, "grad_norm": 1.5859375, "learning_rate": 6.138210692515939e-07, "long_answer_loss": 0.0659, "loss": 0.0702, "short_answer_loss": NaN, "step": 1116, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0616, "grad_norm": 1.3984375, "learning_rate": 6.037161612152606e-07, "long_answer_loss": 0.0616, "loss": 0.0654, "short_answer_loss": NaN, "step": 1117, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0791, "grad_norm": 1.4140625, "learning_rate": 5.936930614431499e-07, "long_answer_loss": 0.0791, "loss": 0.0715, "short_answer_loss": NaN, "step": 1118, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0768, "grad_norm": 1.6328125, "learning_rate": 5.837518388620317e-07, "long_answer_loss": 0.0768, "loss": 0.0759, "short_answer_loss": NaN, "step": 1119, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0762, "grad_norm": 1.3515625, "learning_rate": 5.738925618356206e-07, "long_answer_loss": 0.0762, "loss": 0.0693, "short_answer_loss": NaN, "step": 1120, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0957, "grad_norm": 1.4140625, "learning_rate": 5.641152981641084e-07, "long_answer_loss": 0.0957, "loss": 0.0706, "short_answer_loss": NaN, "step": 1121, "template_loss": 0.0 }, { "epoch": 1.81, "full_loss": 0.0673, "grad_norm": 1.4296875, "learning_rate": 5.544201150837023e-07, "long_answer_loss": 0.0673, "loss": 0.0661, "short_answer_loss": NaN, "step": 1122, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.0617, "grad_norm": 1.4609375, "learning_rate": 5.448070792661533e-07, "long_answer_loss": 0.0617, "loss": 0.067, "short_answer_loss": NaN, "step": 1123, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.072, "grad_norm": 1.4140625, "learning_rate": 5.352762568183067e-07, "long_answer_loss": 0.072, "loss": 0.0719, "short_answer_loss": NaN, "step": 1124, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.0719, "grad_norm": 1.53125, "learning_rate": 5.258277132816388e-07, "long_answer_loss": 0.0719, "loss": 0.0691, "short_answer_loss": NaN, "step": 1125, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.0673, "grad_norm": 1.4375, "learning_rate": 5.164615136318163e-07, "long_answer_loss": 0.0673, "loss": 0.0663, "short_answer_loss": NaN, "step": 1126, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.0696, "grad_norm": 1.515625, "learning_rate": 5.071777222782417e-07, "long_answer_loss": 0.0696, "loss": 0.0656, "short_answer_loss": NaN, "step": 1127, "template_loss": 0.0 }, { "epoch": 1.82, "full_loss": 0.0713, "grad_norm": 1.328125, "learning_rate": 4.979764030636116e-07, "long_answer_loss": 0.0713, "loss": 0.0634, "short_answer_loss": NaN, "step": 1128, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0627, "grad_norm": 1.421875, "learning_rate": 4.888576192634817e-07, "long_answer_loss": 0.0627, "loss": 0.0661, "short_answer_loss": NaN, "step": 1129, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0769, "grad_norm": 1.515625, "learning_rate": 4.798214335858267e-07, "long_answer_loss": 0.0769, "loss": 0.066, "short_answer_loss": NaN, "step": 1130, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0609, "grad_norm": 1.71875, "learning_rate": 4.708679081706136e-07, "long_answer_loss": 0.0609, "loss": 0.0662, "short_answer_loss": NaN, "step": 1131, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0563, "grad_norm": 1.5234375, "learning_rate": 4.6199710458936644e-07, "long_answer_loss": 0.0563, "loss": 0.0627, "short_answer_loss": NaN, "step": 1132, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0636, "grad_norm": 1.4375, "learning_rate": 4.532090838447564e-07, "long_answer_loss": 0.0636, "loss": 0.0637, "short_answer_loss": NaN, "step": 1133, "template_loss": 0.0 }, { "epoch": 1.83, "full_loss": 0.0653, "grad_norm": 1.53125, "learning_rate": 4.4450390637016946e-07, "long_answer_loss": 0.0653, "loss": 0.0671, "short_answer_loss": NaN, "step": 1134, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0794, "grad_norm": 1.4375, "learning_rate": 4.358816320292947e-07, "long_answer_loss": 0.0794, "loss": 0.0685, "short_answer_loss": NaN, "step": 1135, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0536, "grad_norm": 1.453125, "learning_rate": 4.273423201157159e-07, "long_answer_loss": 0.0536, "loss": 0.0651, "short_answer_loss": NaN, "step": 1136, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0628, "grad_norm": 1.46875, "learning_rate": 4.1888602935250267e-07, "long_answer_loss": 0.0628, "loss": 0.0664, "short_answer_loss": NaN, "step": 1137, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0555, "grad_norm": 1.421875, "learning_rate": 4.105128178918033e-07, "long_answer_loss": 0.0555, "loss": 0.0693, "short_answer_loss": NaN, "step": 1138, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0888, "grad_norm": 1.4453125, "learning_rate": 4.022227433144468e-07, "long_answer_loss": 0.0888, "loss": 0.0665, "short_answer_loss": NaN, "step": 1139, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0606, "grad_norm": 1.4453125, "learning_rate": 3.940158626295501e-07, "long_answer_loss": 0.0606, "loss": 0.0662, "short_answer_loss": NaN, "step": 1140, "template_loss": 0.0 }, { "epoch": 1.84, "full_loss": 0.0557, "grad_norm": 1.4609375, "learning_rate": 3.858922322741182e-07, "long_answer_loss": 0.0557, "loss": 0.0664, "short_answer_loss": NaN, "step": 1141, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0565, "grad_norm": 1.390625, "learning_rate": 3.778519081126641e-07, "long_answer_loss": 0.0565, "loss": 0.0645, "short_answer_loss": NaN, "step": 1142, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0817, "grad_norm": 1.578125, "learning_rate": 3.698949454368231e-07, "long_answer_loss": 0.0817, "loss": 0.0759, "short_answer_loss": NaN, "step": 1143, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0748, "grad_norm": 1.421875, "learning_rate": 3.620213989649679e-07, "long_answer_loss": 0.0748, "loss": 0.0667, "short_answer_loss": NaN, "step": 1144, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0776, "grad_norm": 1.5234375, "learning_rate": 3.542313228418359e-07, "long_answer_loss": 0.0776, "loss": 0.0682, "short_answer_loss": NaN, "step": 1145, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0691, "grad_norm": 1.375, "learning_rate": 3.4652477063815833e-07, "long_answer_loss": 0.0691, "loss": 0.0669, "short_answer_loss": NaN, "step": 1146, "template_loss": 0.0 }, { "epoch": 1.85, "full_loss": 0.0666, "grad_norm": 1.5234375, "learning_rate": 3.3890179535028544e-07, "long_answer_loss": 0.0666, "loss": 0.0768, "short_answer_loss": NaN, "step": 1147, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.0666, "grad_norm": 1.359375, "learning_rate": 3.313624493998316e-07, "long_answer_loss": 0.0666, "loss": 0.0599, "short_answer_loss": NaN, "step": 1148, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.0711, "grad_norm": 1.453125, "learning_rate": 3.2390678463330713e-07, "long_answer_loss": 0.0711, "loss": 0.0667, "short_answer_loss": NaN, "step": 1149, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.0645, "grad_norm": 1.40625, "learning_rate": 3.165348523217634e-07, "long_answer_loss": 0.0645, "loss": 0.0642, "short_answer_loss": NaN, "step": 1150, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.0717, "grad_norm": 1.4296875, "learning_rate": 3.092467031604443e-07, "long_answer_loss": 0.0717, "loss": 0.0687, "short_answer_loss": NaN, "step": 1151, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.077, "grad_norm": 1.421875, "learning_rate": 3.0204238726842945e-07, "long_answer_loss": 0.077, "loss": 0.0694, "short_answer_loss": NaN, "step": 1152, "template_loss": 0.0 }, { "epoch": 1.86, "full_loss": 0.0727, "grad_norm": 1.4765625, "learning_rate": 2.9492195418829997e-07, "long_answer_loss": 0.0727, "loss": 0.0674, "short_answer_loss": NaN, "step": 1153, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0613, "grad_norm": 1.40625, "learning_rate": 2.878854528857888e-07, "long_answer_loss": 0.0613, "loss": 0.0721, "short_answer_loss": NaN, "step": 1154, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0614, "grad_norm": 1.375, "learning_rate": 2.8093293174944883e-07, "long_answer_loss": 0.0614, "loss": 0.0649, "short_answer_loss": NaN, "step": 1155, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0862, "grad_norm": 1.515625, "learning_rate": 2.740644385903199e-07, "long_answer_loss": 0.0862, "loss": 0.0664, "short_answer_loss": NaN, "step": 1156, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0653, "grad_norm": 1.4921875, "learning_rate": 2.672800206415971e-07, "long_answer_loss": 0.0653, "loss": 0.0659, "short_answer_loss": NaN, "step": 1157, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0598, "grad_norm": 1.4375, "learning_rate": 2.605797245583075e-07, "long_answer_loss": 0.0598, "loss": 0.0659, "short_answer_loss": NaN, "step": 1158, "template_loss": 0.0 }, { "epoch": 1.87, "full_loss": 0.0685, "grad_norm": 1.453125, "learning_rate": 2.5396359641699093e-07, "long_answer_loss": 0.0685, "loss": 0.0667, "short_answer_loss": NaN, "step": 1159, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.0711, "grad_norm": 1.4765625, "learning_rate": 2.474316817153821e-07, "long_answer_loss": 0.0711, "loss": 0.0694, "short_answer_loss": NaN, "step": 1160, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.0771, "grad_norm": 1.4296875, "learning_rate": 2.4098402537209577e-07, "long_answer_loss": 0.0771, "loss": 0.0703, "short_answer_loss": NaN, "step": 1161, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.0707, "grad_norm": 1.4375, "learning_rate": 2.3462067172632246e-07, "long_answer_loss": 0.0707, "loss": 0.065, "short_answer_loss": NaN, "step": 1162, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.072, "grad_norm": 1.4765625, "learning_rate": 2.2834166453751805e-07, "long_answer_loss": 0.072, "loss": 0.0671, "short_answer_loss": NaN, "step": 1163, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.0784, "grad_norm": 1.46875, "learning_rate": 2.2214704698510503e-07, "long_answer_loss": 0.0784, "loss": 0.0681, "short_answer_loss": NaN, "step": 1164, "template_loss": 0.0 }, { "epoch": 1.88, "full_loss": 0.0633, "grad_norm": 1.4140625, "learning_rate": 2.160368616681785e-07, "long_answer_loss": 0.0633, "loss": 0.062, "short_answer_loss": NaN, "step": 1165, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0567, "grad_norm": 1.4609375, "learning_rate": 2.1001115060520772e-07, "long_answer_loss": 0.0567, "loss": 0.0656, "short_answer_loss": NaN, "step": 1166, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0799, "grad_norm": 1.359375, "learning_rate": 2.04069955233753e-07, "long_answer_loss": 0.0799, "loss": 0.0628, "short_answer_loss": NaN, "step": 1167, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0707, "grad_norm": 1.5546875, "learning_rate": 1.9821331641017572e-07, "long_answer_loss": 0.0707, "loss": 0.07, "short_answer_loss": NaN, "step": 1168, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0638, "grad_norm": 1.375, "learning_rate": 1.9244127440936066e-07, "long_answer_loss": 0.0638, "loss": 0.0649, "short_answer_loss": NaN, "step": 1169, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0837, "grad_norm": 1.515625, "learning_rate": 1.8675386892443858e-07, "long_answer_loss": 0.0837, "loss": 0.0696, "short_answer_loss": NaN, "step": 1170, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0755, "grad_norm": 1.5390625, "learning_rate": 1.8115113906650856e-07, "long_answer_loss": 0.0755, "loss": 0.0708, "short_answer_loss": NaN, "step": 1171, "template_loss": 0.0 }, { "epoch": 1.89, "full_loss": 0.0655, "grad_norm": 1.4453125, "learning_rate": 1.7563312336437848e-07, "long_answer_loss": 0.0655, "loss": 0.0668, "short_answer_loss": NaN, "step": 1172, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0614, "grad_norm": 1.4296875, "learning_rate": 1.7019985976429174e-07, "long_answer_loss": 0.0614, "loss": 0.0653, "short_answer_loss": NaN, "step": 1173, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0657, "grad_norm": 1.4296875, "learning_rate": 1.6485138562966906e-07, "long_answer_loss": 0.0657, "loss": 0.0665, "short_answer_loss": NaN, "step": 1174, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0609, "grad_norm": 1.421875, "learning_rate": 1.5958773774085166e-07, "long_answer_loss": 0.0609, "loss": 0.0707, "short_answer_loss": NaN, "step": 1175, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0635, "grad_norm": 1.3515625, "learning_rate": 1.5440895229485026e-07, "long_answer_loss": 0.0635, "loss": 0.0637, "short_answer_loss": NaN, "step": 1176, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0547, "grad_norm": 1.5078125, "learning_rate": 1.493150649050923e-07, "long_answer_loss": 0.0547, "loss": 0.068, "short_answer_loss": NaN, "step": 1177, "template_loss": 0.0 }, { "epoch": 1.9, "full_loss": 0.0559, "grad_norm": 1.53125, "learning_rate": 1.4430611060117922e-07, "long_answer_loss": 0.0559, "loss": 0.0781, "short_answer_loss": NaN, "step": 1178, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.0682, "grad_norm": 1.4921875, "learning_rate": 1.3938212382864497e-07, "long_answer_loss": 0.0682, "loss": 0.0741, "short_answer_loss": NaN, "step": 1179, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.0589, "grad_norm": 1.453125, "learning_rate": 1.345431384487214e-07, "long_answer_loss": 0.0589, "loss": 0.0646, "short_answer_loss": NaN, "step": 1180, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.073, "grad_norm": 1.4296875, "learning_rate": 1.2978918773810243e-07, "long_answer_loss": 0.073, "loss": 0.0696, "short_answer_loss": NaN, "step": 1181, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.06, "grad_norm": 1.453125, "learning_rate": 1.251203043887164e-07, "long_answer_loss": 0.06, "loss": 0.0728, "short_answer_loss": NaN, "step": 1182, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.0635, "grad_norm": 1.515625, "learning_rate": 1.2053652050749846e-07, "long_answer_loss": 0.0635, "loss": 0.0673, "short_answer_loss": NaN, "step": 1183, "template_loss": 0.0 }, { "epoch": 1.91, "full_loss": 0.0641, "grad_norm": 1.5, "learning_rate": 1.160378676161783e-07, "long_answer_loss": 0.0641, "loss": 0.0723, "short_answer_loss": NaN, "step": 1184, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0841, "grad_norm": 1.453125, "learning_rate": 1.1162437665105108e-07, "long_answer_loss": 0.0841, "loss": 0.0718, "short_answer_loss": NaN, "step": 1185, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0664, "grad_norm": 1.390625, "learning_rate": 1.0729607796277629e-07, "long_answer_loss": 0.0664, "loss": 0.0669, "short_answer_loss": NaN, "step": 1186, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0783, "grad_norm": 1.3515625, "learning_rate": 1.0305300131616125e-07, "long_answer_loss": 0.0783, "loss": 0.0656, "short_answer_loss": NaN, "step": 1187, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0659, "grad_norm": 1.3671875, "learning_rate": 9.889517588995839e-08, "long_answer_loss": 0.0659, "loss": 0.0679, "short_answer_loss": NaN, "step": 1188, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0664, "grad_norm": 1.5234375, "learning_rate": 9.482263027666833e-08, "long_answer_loss": 0.0664, "loss": 0.0708, "short_answer_loss": NaN, "step": 1189, "template_loss": 0.0 }, { "epoch": 1.92, "full_loss": 0.0586, "grad_norm": 1.3359375, "learning_rate": 9.083539248233852e-08, "long_answer_loss": 0.0586, "loss": 0.0622, "short_answer_loss": NaN, "step": 1190, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0623, "grad_norm": 1.4453125, "learning_rate": 8.693348992637046e-08, "long_answer_loss": 0.0623, "loss": 0.0672, "short_answer_loss": NaN, "step": 1191, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0905, "grad_norm": 1.4921875, "learning_rate": 8.311694944133502e-08, "long_answer_loss": 0.0905, "loss": 0.0699, "short_answer_loss": NaN, "step": 1192, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0673, "grad_norm": 1.484375, "learning_rate": 7.938579727278517e-08, "long_answer_loss": 0.0673, "loss": 0.068, "short_answer_loss": NaN, "step": 1193, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0676, "grad_norm": 1.5078125, "learning_rate": 7.574005907907966e-08, "long_answer_loss": 0.0676, "loss": 0.0718, "short_answer_loss": NaN, "step": 1194, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0656, "grad_norm": 1.3984375, "learning_rate": 7.217975993119713e-08, "long_answer_loss": 0.0656, "loss": 0.0641, "short_answer_loss": NaN, "step": 1195, "template_loss": 0.0 }, { "epoch": 1.93, "full_loss": 0.0809, "grad_norm": 1.390625, "learning_rate": 6.87049243125723e-08, "long_answer_loss": 0.0809, "loss": 0.0661, "short_answer_loss": NaN, "step": 1196, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0564, "grad_norm": 1.484375, "learning_rate": 6.531557611892669e-08, "long_answer_loss": 0.0564, "loss": 0.0698, "short_answer_loss": NaN, "step": 1197, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0766, "grad_norm": 1.4140625, "learning_rate": 6.201173865810207e-08, "long_answer_loss": 0.0766, "loss": 0.0637, "short_answer_loss": NaN, "step": 1198, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0868, "grad_norm": 1.8046875, "learning_rate": 5.879343464989806e-08, "long_answer_loss": 0.0868, "loss": 0.0755, "short_answer_loss": NaN, "step": 1199, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0726, "grad_norm": 1.4453125, "learning_rate": 5.566068622592235e-08, "long_answer_loss": 0.0726, "loss": 0.0702, "short_answer_loss": NaN, "step": 1200, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0771, "grad_norm": 1.5, "learning_rate": 5.261351492943101e-08, "long_answer_loss": 0.0771, "loss": 0.0651, "short_answer_loss": NaN, "step": 1201, "template_loss": 0.0 }, { "epoch": 1.94, "full_loss": 0.0595, "grad_norm": 1.53125, "learning_rate": 4.965194171518833e-08, "long_answer_loss": 0.0595, "loss": 0.0685, "short_answer_loss": NaN, "step": 1202, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0644, "grad_norm": 1.4609375, "learning_rate": 4.677598694931285e-08, "long_answer_loss": 0.0644, "loss": 0.0731, "short_answer_loss": NaN, "step": 1203, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0676, "grad_norm": 1.453125, "learning_rate": 4.3985670409148196e-08, "long_answer_loss": 0.0676, "loss": 0.0671, "short_answer_loss": NaN, "step": 1204, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0634, "grad_norm": 1.390625, "learning_rate": 4.128101128312023e-08, "long_answer_loss": 0.0634, "loss": 0.071, "short_answer_loss": NaN, "step": 1205, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0868, "grad_norm": 1.484375, "learning_rate": 3.866202817060377e-08, "long_answer_loss": 0.0868, "loss": 0.0722, "short_answer_loss": NaN, "step": 1206, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0788, "grad_norm": 1.4453125, "learning_rate": 3.612873908180048e-08, "long_answer_loss": 0.0788, "loss": 0.0697, "short_answer_loss": NaN, "step": 1207, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.073, "grad_norm": 1.609375, "learning_rate": 3.3681161437612575e-08, "long_answer_loss": 0.073, "loss": 0.0694, "short_answer_loss": NaN, "step": 1208, "template_loss": 0.0 }, { "epoch": 1.95, "full_loss": 0.0587, "grad_norm": 1.421875, "learning_rate": 3.131931206951933e-08, "long_answer_loss": 0.0587, "loss": 0.064, "short_answer_loss": NaN, "step": 1209, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.053, "grad_norm": 1.5390625, "learning_rate": 2.9043207219468795e-08, "long_answer_loss": 0.053, "loss": 0.0679, "short_answer_loss": NaN, "step": 1210, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.0482, "grad_norm": 1.5625, "learning_rate": 2.6852862539757106e-08, "long_answer_loss": 0.0482, "loss": 0.0711, "short_answer_loss": NaN, "step": 1211, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.0736, "grad_norm": 1.40625, "learning_rate": 2.4748293092931308e-08, "long_answer_loss": 0.0736, "loss": 0.067, "short_answer_loss": NaN, "step": 1212, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.0611, "grad_norm": 1.46875, "learning_rate": 2.2729513351672783e-08, "long_answer_loss": 0.0611, "loss": 0.0718, "short_answer_loss": NaN, "step": 1213, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.0569, "grad_norm": 1.4609375, "learning_rate": 2.0796537198712608e-08, "long_answer_loss": 0.0569, "loss": 0.0694, "short_answer_loss": NaN, "step": 1214, "template_loss": 0.0 }, { "epoch": 1.96, "full_loss": 0.0782, "grad_norm": 1.4375, "learning_rate": 1.894937792672191e-08, "long_answer_loss": 0.0782, "loss": 0.0714, "short_answer_loss": NaN, "step": 1215, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.0704, "grad_norm": 1.515625, "learning_rate": 1.7188048238232778e-08, "long_answer_loss": 0.0704, "loss": 0.0732, "short_answer_loss": NaN, "step": 1216, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.0611, "grad_norm": 1.40625, "learning_rate": 1.5512560245541097e-08, "long_answer_loss": 0.0611, "loss": 0.0676, "short_answer_loss": NaN, "step": 1217, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.062, "grad_norm": 1.5, "learning_rate": 1.3922925470627458e-08, "long_answer_loss": 0.062, "loss": 0.0705, "short_answer_loss": NaN, "step": 1218, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.0516, "grad_norm": 1.4375, "learning_rate": 1.2419154845079439e-08, "long_answer_loss": 0.0516, "loss": 0.0623, "short_answer_loss": NaN, "step": 1219, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.0778, "grad_norm": 1.4765625, "learning_rate": 1.1001258710015283e-08, "long_answer_loss": 0.0778, "loss": 0.0637, "short_answer_loss": NaN, "step": 1220, "template_loss": 0.0 }, { "epoch": 1.97, "full_loss": 0.0571, "grad_norm": 1.484375, "learning_rate": 9.669246816010335e-09, "long_answer_loss": 0.0571, "loss": 0.0676, "short_answer_loss": NaN, "step": 1221, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.0721, "grad_norm": 1.4140625, "learning_rate": 8.423128323033213e-09, "long_answer_loss": 0.0721, "loss": 0.0687, "short_answer_loss": NaN, "step": 1222, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.0665, "grad_norm": 1.515625, "learning_rate": 7.262911800379191e-09, "long_answer_loss": 0.0665, "loss": 0.073, "short_answer_loss": NaN, "step": 1223, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.0723, "grad_norm": 1.5078125, "learning_rate": 6.188605226618849e-09, "long_answer_loss": 0.0723, "loss": 0.0722, "short_answer_loss": NaN, "step": 1224, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.0667, "grad_norm": 1.390625, "learning_rate": 5.200215989531465e-09, "long_answer_loss": 0.0667, "loss": 0.0653, "short_answer_loss": NaN, "step": 1225, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.061, "grad_norm": 1.421875, "learning_rate": 4.297750886064766e-09, "long_answer_loss": 0.061, "loss": 0.0637, "short_answer_loss": NaN, "step": 1226, "template_loss": 0.0 }, { "epoch": 1.98, "full_loss": 0.0719, "grad_norm": 1.484375, "learning_rate": 3.481216122284969e-09, "long_answer_loss": 0.0719, "loss": 0.0714, "short_answer_loss": NaN, "step": 1227, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0724, "grad_norm": 1.421875, "learning_rate": 2.7506173133282075e-09, "long_answer_loss": 0.0724, "loss": 0.0656, "short_answer_loss": NaN, "step": 1228, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0666, "grad_norm": 1.515625, "learning_rate": 2.105959483371389e-09, "long_answer_loss": 0.0666, "loss": 0.0666, "short_answer_loss": NaN, "step": 1229, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0949, "grad_norm": 1.4375, "learning_rate": 1.547247065593338e-09, "long_answer_loss": 0.0949, "loss": 0.0695, "short_answer_loss": NaN, "step": 1230, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0807, "grad_norm": 1.4296875, "learning_rate": 1.0744839021428755e-09, "long_answer_loss": 0.0807, "loss": 0.0709, "short_answer_loss": NaN, "step": 1231, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0711, "grad_norm": 1.5, "learning_rate": 6.876732441110645e-10, "long_answer_loss": 0.0711, "loss": 0.0677, "short_answer_loss": NaN, "step": 1232, "template_loss": 0.0 }, { "epoch": 1.99, "full_loss": 0.0797, "grad_norm": 1.46875, "learning_rate": 3.868177515173321e-10, "long_answer_loss": 0.0797, "loss": 0.0694, "short_answer_loss": NaN, "step": 1233, "template_loss": 0.0 }, { "epoch": 2.0, "full_loss": 0.0711, "grad_norm": 1.4140625, "learning_rate": 1.7191949328032587e-10, "long_answer_loss": 0.0711, "loss": 0.0705, "short_answer_loss": NaN, "step": 1234, "template_loss": 0.0 }, { "epoch": 2.0, "full_loss": 0.0733, "grad_norm": 1.375, "learning_rate": 4.297994721097487e-11, "long_answer_loss": 0.0733, "loss": 0.0645, "short_answer_loss": NaN, "step": 1235, "template_loss": 0.0 }, { "epoch": 2.0, "full_loss": 0.0661, "grad_norm": 1.65625, "learning_rate": 0.0, "long_answer_loss": 0.0661, "loss": 0.0717, "short_answer_loss": NaN, "step": 1236, "template_loss": 0.0 }, { "epoch": 2.0, "step": 1236, "total_flos": 9.29924110325121e+17, "train_loss": 0.12603917728482616, "train_runtime": 5247.2889, "train_samples_per_second": 30.175, "train_steps_per_second": 0.236 } ], "logging_steps": 1.0, "max_steps": 1236, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1.0, "total_flos": 9.29924110325121e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }