diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28806 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9996178100515958, + "eval_steps": 25.0, + "global_step": 2616, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "full_loss": 0.609, + "grad_norm": 11.4375, + "learning_rate": 3.1645569620253163e-07, + "long_answer_loss": 0.609, + "loss": 0.5548, + "short_answer_loss": NaN, + "step": 1, + "template_loss": 0.0 + }, + { + "epoch": 0.0, + "full_loss": 0.6229, + "grad_norm": 11.5, + "learning_rate": 6.329113924050633e-07, + "long_answer_loss": 0.6229, + "loss": 0.5756, + "short_answer_loss": NaN, + "step": 2, + "template_loss": 0.0 + }, + { + "epoch": 0.0, + "full_loss": 0.5579, + "grad_norm": 11.6875, + "learning_rate": 9.493670886075951e-07, + "long_answer_loss": 0.5579, + "loss": 0.5652, + "short_answer_loss": NaN, + "step": 3, + "template_loss": 0.0 + }, + { + "epoch": 0.0, + "full_loss": 0.6103, + "grad_norm": 11.625, + "learning_rate": 1.2658227848101265e-06, + "long_answer_loss": 0.6103, + "loss": 0.578, + "short_answer_loss": NaN, + "step": 4, + "template_loss": 0.0 + }, + { + "epoch": 0.0, + "full_loss": 0.5432, + "grad_norm": 11.0, + "learning_rate": 1.5822784810126583e-06, + "long_answer_loss": 0.5432, + "loss": 0.555, + "short_answer_loss": NaN, + "step": 5, + "template_loss": 0.0 + }, + { + "epoch": 0.0, + "full_loss": 0.5516, + "grad_norm": 10.5, + "learning_rate": 1.8987341772151901e-06, + "long_answer_loss": 0.5516, + "loss": 0.5425, + "short_answer_loss": NaN, + "step": 6, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.5406, + "grad_norm": 10.1875, + "learning_rate": 2.2151898734177215e-06, + "long_answer_loss": 0.5406, + "loss": 0.5275, + "short_answer_loss": NaN, + "step": 7, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.516, + "grad_norm": 9.375, + "learning_rate": 2.531645569620253e-06, + "long_answer_loss": 0.516, + "loss": 0.5037, + "short_answer_loss": NaN, + "step": 8, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.439, + "grad_norm": 7.9375, + "learning_rate": 2.848101265822785e-06, + "long_answer_loss": 0.439, + "loss": 0.4711, + "short_answer_loss": NaN, + "step": 9, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3844, + "grad_norm": 6.4375, + "learning_rate": 3.1645569620253167e-06, + "long_answer_loss": 0.3844, + "loss": 0.4277, + "short_answer_loss": NaN, + "step": 10, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3622, + "grad_norm": 5.875, + "learning_rate": 3.4810126582278482e-06, + "long_answer_loss": 0.3622, + "loss": 0.3967, + "short_answer_loss": NaN, + "step": 11, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3716, + "grad_norm": 4.96875, + "learning_rate": 3.7974683544303802e-06, + "long_answer_loss": 0.3716, + "loss": 0.3841, + "short_answer_loss": NaN, + "step": 12, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3691, + "grad_norm": 5.03125, + "learning_rate": 4.113924050632911e-06, + "long_answer_loss": 0.3691, + "loss": 0.3762, + "short_answer_loss": NaN, + "step": 13, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3616, + "grad_norm": 4.625, + "learning_rate": 4.430379746835443e-06, + "long_answer_loss": 0.3616, + "loss": 0.3497, + "short_answer_loss": NaN, + "step": 14, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.3443, + "grad_norm": 5.0, + "learning_rate": 4.746835443037975e-06, + "long_answer_loss": 0.3443, + "loss": 0.3311, + "short_answer_loss": NaN, + "step": 15, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.2819, + "grad_norm": 4.625, + "learning_rate": 5.063291139240506e-06, + "long_answer_loss": 0.2819, + "loss": 0.3011, + "short_answer_loss": NaN, + "step": 16, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.2899, + "grad_norm": 4.59375, + "learning_rate": 5.379746835443038e-06, + "long_answer_loss": 0.2899, + "loss": 0.2938, + "short_answer_loss": NaN, + "step": 17, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.258, + "grad_norm": 4.0625, + "learning_rate": 5.69620253164557e-06, + "long_answer_loss": 0.258, + "loss": 0.2691, + "short_answer_loss": NaN, + "step": 18, + "template_loss": 0.0 + }, + { + "epoch": 0.01, + "full_loss": 0.2726, + "grad_norm": 3.6875, + "learning_rate": 6.012658227848101e-06, + "long_answer_loss": 0.2726, + "loss": 0.2777, + "short_answer_loss": NaN, + "step": 19, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.284, + "grad_norm": 2.953125, + "learning_rate": 6.329113924050633e-06, + "long_answer_loss": 0.284, + "loss": 0.2524, + "short_answer_loss": NaN, + "step": 20, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2594, + "grad_norm": 2.765625, + "learning_rate": 6.6455696202531645e-06, + "long_answer_loss": 0.2594, + "loss": 0.2483, + "short_answer_loss": NaN, + "step": 21, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2399, + "grad_norm": 2.703125, + "learning_rate": 6.9620253164556965e-06, + "long_answer_loss": 0.2399, + "loss": 0.2333, + "short_answer_loss": NaN, + "step": 22, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2543, + "grad_norm": 2.75, + "learning_rate": 7.2784810126582285e-06, + "long_answer_loss": 0.2543, + "loss": 0.2374, + "short_answer_loss": NaN, + "step": 23, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2499, + "grad_norm": 2.640625, + "learning_rate": 7.5949367088607605e-06, + "long_answer_loss": 0.2499, + "loss": 0.2296, + "short_answer_loss": NaN, + "step": 24, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2313, + "grad_norm": 2.96875, + "learning_rate": 7.911392405063292e-06, + "long_answer_loss": 0.2313, + "loss": 0.2251, + "short_answer_loss": NaN, + "step": 25, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2233, + "grad_norm": 2.75, + "learning_rate": 8.227848101265822e-06, + "long_answer_loss": 0.2233, + "loss": 0.2148, + "short_answer_loss": NaN, + "step": 26, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2203, + "grad_norm": 2.828125, + "learning_rate": 8.544303797468354e-06, + "long_answer_loss": 0.2203, + "loss": 0.2176, + "short_answer_loss": NaN, + "step": 27, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.1993, + "grad_norm": 2.578125, + "learning_rate": 8.860759493670886e-06, + "long_answer_loss": 0.1993, + "loss": 0.2042, + "short_answer_loss": NaN, + "step": 28, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.197, + "grad_norm": 2.578125, + "learning_rate": 9.177215189873418e-06, + "long_answer_loss": 0.197, + "loss": 0.2116, + "short_answer_loss": NaN, + "step": 29, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.186, + "grad_norm": 2.953125, + "learning_rate": 9.49367088607595e-06, + "long_answer_loss": 0.186, + "loss": 0.2126, + "short_answer_loss": NaN, + "step": 30, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2096, + "grad_norm": 3.09375, + "learning_rate": 9.81012658227848e-06, + "long_answer_loss": 0.2096, + "loss": 0.1996, + "short_answer_loss": NaN, + "step": 31, + "template_loss": 0.0 + }, + { + "epoch": 0.02, + "full_loss": 0.2196, + "grad_norm": 2.796875, + "learning_rate": 1.0126582278481012e-05, + "long_answer_loss": 0.2196, + "loss": 0.2065, + "short_answer_loss": NaN, + "step": 32, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1981, + "grad_norm": 2.78125, + "learning_rate": 1.0443037974683544e-05, + "long_answer_loss": 0.1981, + "loss": 0.1855, + "short_answer_loss": NaN, + "step": 33, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.2032, + "grad_norm": 2.546875, + "learning_rate": 1.0759493670886076e-05, + "long_answer_loss": 0.2032, + "loss": 0.1861, + "short_answer_loss": NaN, + "step": 34, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1814, + "grad_norm": 2.6875, + "learning_rate": 1.1075949367088608e-05, + "long_answer_loss": 0.1814, + "loss": 0.1831, + "short_answer_loss": NaN, + "step": 35, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1701, + "grad_norm": 2.421875, + "learning_rate": 1.139240506329114e-05, + "long_answer_loss": 0.1701, + "loss": 0.1845, + "short_answer_loss": NaN, + "step": 36, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1923, + "grad_norm": 2.625, + "learning_rate": 1.170886075949367e-05, + "long_answer_loss": 0.1923, + "loss": 0.1795, + "short_answer_loss": NaN, + "step": 37, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1675, + "grad_norm": 2.421875, + "learning_rate": 1.2025316455696203e-05, + "long_answer_loss": 0.1675, + "loss": 0.1782, + "short_answer_loss": NaN, + "step": 38, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1975, + "grad_norm": 2.46875, + "learning_rate": 1.2341772151898735e-05, + "long_answer_loss": 0.1975, + "loss": 0.1831, + "short_answer_loss": NaN, + "step": 39, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.2041, + "grad_norm": 2.90625, + "learning_rate": 1.2658227848101267e-05, + "long_answer_loss": 0.2041, + "loss": 0.1855, + "short_answer_loss": NaN, + "step": 40, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1467, + "grad_norm": 2.578125, + "learning_rate": 1.2974683544303799e-05, + "long_answer_loss": 0.1467, + "loss": 0.1809, + "short_answer_loss": NaN, + "step": 41, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1754, + "grad_norm": 2.34375, + "learning_rate": 1.3291139240506329e-05, + "long_answer_loss": 0.1754, + "loss": 0.1787, + "short_answer_loss": NaN, + "step": 42, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1669, + "grad_norm": 2.6875, + "learning_rate": 1.3607594936708861e-05, + "long_answer_loss": 0.1669, + "loss": 0.1696, + "short_answer_loss": NaN, + "step": 43, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1715, + "grad_norm": 2.875, + "learning_rate": 1.3924050632911393e-05, + "long_answer_loss": 0.1715, + "loss": 0.1773, + "short_answer_loss": NaN, + "step": 44, + "template_loss": 0.0 + }, + { + "epoch": 0.03, + "full_loss": 0.1737, + "grad_norm": 2.640625, + "learning_rate": 1.4240506329113925e-05, + "long_answer_loss": 0.1737, + "loss": 0.1789, + "short_answer_loss": NaN, + "step": 45, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1641, + "grad_norm": 2.796875, + "learning_rate": 1.4556962025316457e-05, + "long_answer_loss": 0.1641, + "loss": 0.1756, + "short_answer_loss": NaN, + "step": 46, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1743, + "grad_norm": 2.515625, + "learning_rate": 1.4873417721518987e-05, + "long_answer_loss": 0.1743, + "loss": 0.1728, + "short_answer_loss": NaN, + "step": 47, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1921, + "grad_norm": 2.515625, + "learning_rate": 1.5189873417721521e-05, + "long_answer_loss": 0.1921, + "loss": 0.1766, + "short_answer_loss": NaN, + "step": 48, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.2013, + "grad_norm": 2.5, + "learning_rate": 1.550632911392405e-05, + "long_answer_loss": 0.2013, + "loss": 0.1804, + "short_answer_loss": NaN, + "step": 49, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1937, + "grad_norm": 2.703125, + "learning_rate": 1.5822784810126583e-05, + "long_answer_loss": 0.1937, + "loss": 0.1868, + "short_answer_loss": NaN, + "step": 50, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1877, + "grad_norm": 2.75, + "learning_rate": 1.6139240506329115e-05, + "long_answer_loss": 0.1877, + "loss": 0.1723, + "short_answer_loss": NaN, + "step": 51, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1664, + "grad_norm": 2.640625, + "learning_rate": 1.6455696202531644e-05, + "long_answer_loss": 0.1664, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 52, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1944, + "grad_norm": 2.625, + "learning_rate": 1.677215189873418e-05, + "long_answer_loss": 0.1944, + "loss": 0.1713, + "short_answer_loss": NaN, + "step": 53, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1679, + "grad_norm": 2.640625, + "learning_rate": 1.7088607594936708e-05, + "long_answer_loss": 0.1679, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 54, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1657, + "grad_norm": 2.5625, + "learning_rate": 1.7405063291139243e-05, + "long_answer_loss": 0.1657, + "loss": 0.1642, + "short_answer_loss": NaN, + "step": 55, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.177, + "grad_norm": 2.609375, + "learning_rate": 1.7721518987341772e-05, + "long_answer_loss": 0.177, + "loss": 0.1739, + "short_answer_loss": NaN, + "step": 56, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1579, + "grad_norm": 2.703125, + "learning_rate": 1.8037974683544304e-05, + "long_answer_loss": 0.1579, + "loss": 0.1722, + "short_answer_loss": NaN, + "step": 57, + "template_loss": 0.0 + }, + { + "epoch": 0.04, + "full_loss": 0.1785, + "grad_norm": 2.46875, + "learning_rate": 1.8354430379746836e-05, + "long_answer_loss": 0.1785, + "loss": 0.1668, + "short_answer_loss": NaN, + "step": 58, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1544, + "grad_norm": 2.734375, + "learning_rate": 1.8670886075949368e-05, + "long_answer_loss": 0.1544, + "loss": 0.173, + "short_answer_loss": NaN, + "step": 59, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1487, + "grad_norm": 2.859375, + "learning_rate": 1.89873417721519e-05, + "long_answer_loss": 0.1487, + "loss": 0.1755, + "short_answer_loss": NaN, + "step": 60, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1978, + "grad_norm": 2.75, + "learning_rate": 1.9303797468354432e-05, + "long_answer_loss": 0.1978, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 61, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1614, + "grad_norm": 2.75, + "learning_rate": 1.962025316455696e-05, + "long_answer_loss": 0.1614, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 62, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1567, + "grad_norm": 2.5, + "learning_rate": 1.9936708860759496e-05, + "long_answer_loss": 0.1567, + "loss": 0.1687, + "short_answer_loss": NaN, + "step": 63, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1606, + "grad_norm": 2.453125, + "learning_rate": 2.0253164556962025e-05, + "long_answer_loss": 0.1606, + "loss": 0.1682, + "short_answer_loss": NaN, + "step": 64, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1524, + "grad_norm": 2.734375, + "learning_rate": 2.056962025316456e-05, + "long_answer_loss": 0.1524, + "loss": 0.1682, + "short_answer_loss": NaN, + "step": 65, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1714, + "grad_norm": 2.765625, + "learning_rate": 2.088607594936709e-05, + "long_answer_loss": 0.1714, + "loss": 0.1698, + "short_answer_loss": NaN, + "step": 66, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1756, + "grad_norm": 2.859375, + "learning_rate": 2.120253164556962e-05, + "long_answer_loss": 0.1756, + "loss": 0.1721, + "short_answer_loss": NaN, + "step": 67, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1587, + "grad_norm": 2.671875, + "learning_rate": 2.1518987341772153e-05, + "long_answer_loss": 0.1587, + "loss": 0.1768, + "short_answer_loss": NaN, + "step": 68, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1785, + "grad_norm": 3.3125, + "learning_rate": 2.1835443037974685e-05, + "long_answer_loss": 0.1785, + "loss": 0.1778, + "short_answer_loss": NaN, + "step": 69, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1619, + "grad_norm": 2.71875, + "learning_rate": 2.2151898734177217e-05, + "long_answer_loss": 0.1619, + "loss": 0.1693, + "short_answer_loss": NaN, + "step": 70, + "template_loss": 0.0 + }, + { + "epoch": 0.05, + "full_loss": 0.1524, + "grad_norm": 2.796875, + "learning_rate": 2.246835443037975e-05, + "long_answer_loss": 0.1524, + "loss": 0.1772, + "short_answer_loss": NaN, + "step": 71, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1541, + "grad_norm": 2.59375, + "learning_rate": 2.278481012658228e-05, + "long_answer_loss": 0.1541, + "loss": 0.1693, + "short_answer_loss": NaN, + "step": 72, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1781, + "grad_norm": 2.984375, + "learning_rate": 2.3101265822784813e-05, + "long_answer_loss": 0.1781, + "loss": 0.1724, + "short_answer_loss": NaN, + "step": 73, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1583, + "grad_norm": 2.59375, + "learning_rate": 2.341772151898734e-05, + "long_answer_loss": 0.1583, + "loss": 0.1792, + "short_answer_loss": NaN, + "step": 74, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1577, + "grad_norm": 2.671875, + "learning_rate": 2.3734177215189873e-05, + "long_answer_loss": 0.1577, + "loss": 0.171, + "short_answer_loss": NaN, + "step": 75, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1753, + "grad_norm": 2.515625, + "learning_rate": 2.4050632911392405e-05, + "long_answer_loss": 0.1753, + "loss": 0.1788, + "short_answer_loss": NaN, + "step": 76, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1595, + "grad_norm": 2.859375, + "learning_rate": 2.4367088607594937e-05, + "long_answer_loss": 0.1595, + "loss": 0.1776, + "short_answer_loss": NaN, + "step": 77, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.2065, + "grad_norm": 2.578125, + "learning_rate": 2.468354430379747e-05, + "long_answer_loss": 0.2065, + "loss": 0.1759, + "short_answer_loss": NaN, + "step": 78, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1924, + "grad_norm": 2.671875, + "learning_rate": 2.5e-05, + "long_answer_loss": 0.1924, + "loss": 0.1815, + "short_answer_loss": NaN, + "step": 79, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.17, + "grad_norm": 2.5625, + "learning_rate": 2.4999990416177256e-05, + "long_answer_loss": 0.17, + "loss": 0.1732, + "short_answer_loss": NaN, + "step": 80, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1657, + "grad_norm": 2.359375, + "learning_rate": 2.4999961664723716e-05, + "long_answer_loss": 0.1657, + "loss": 0.1737, + "short_answer_loss": NaN, + "step": 81, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1976, + "grad_norm": 2.796875, + "learning_rate": 2.4999913745683463e-05, + "long_answer_loss": 0.1976, + "loss": 0.1828, + "short_answer_loss": NaN, + "step": 82, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1684, + "grad_norm": 2.59375, + "learning_rate": 2.4999846659129984e-05, + "long_answer_loss": 0.1684, + "loss": 0.1828, + "short_answer_loss": NaN, + "step": 83, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1616, + "grad_norm": 2.484375, + "learning_rate": 2.4999760405166147e-05, + "long_answer_loss": 0.1616, + "loss": 0.1823, + "short_answer_loss": NaN, + "step": 84, + "template_loss": 0.0 + }, + { + "epoch": 0.06, + "full_loss": 0.1942, + "grad_norm": 2.734375, + "learning_rate": 2.4999654983924213e-05, + "long_answer_loss": 0.1942, + "loss": 0.1864, + "short_answer_loss": NaN, + "step": 85, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1786, + "grad_norm": 2.59375, + "learning_rate": 2.499953039556584e-05, + "long_answer_loss": 0.1786, + "loss": 0.1769, + "short_answer_loss": NaN, + "step": 86, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1749, + "grad_norm": 2.296875, + "learning_rate": 2.4999386640282073e-05, + "long_answer_loss": 0.1749, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 87, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1575, + "grad_norm": 2.34375, + "learning_rate": 2.4999223718293347e-05, + "long_answer_loss": 0.1575, + "loss": 0.1861, + "short_answer_loss": NaN, + "step": 88, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1993, + "grad_norm": 2.421875, + "learning_rate": 2.4999041629849486e-05, + "long_answer_loss": 0.1993, + "loss": 0.184, + "short_answer_loss": NaN, + "step": 89, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1679, + "grad_norm": 2.296875, + "learning_rate": 2.4998840375229712e-05, + "long_answer_loss": 0.1679, + "loss": 0.1804, + "short_answer_loss": NaN, + "step": 90, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1924, + "grad_norm": 2.28125, + "learning_rate": 2.4998619954742626e-05, + "long_answer_loss": 0.1924, + "loss": 0.1807, + "short_answer_loss": NaN, + "step": 91, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1604, + "grad_norm": 2.140625, + "learning_rate": 2.4998380368726225e-05, + "long_answer_loss": 0.1604, + "loss": 0.1798, + "short_answer_loss": NaN, + "step": 92, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1709, + "grad_norm": 2.359375, + "learning_rate": 2.4998121617547894e-05, + "long_answer_loss": 0.1709, + "loss": 0.1863, + "short_answer_loss": NaN, + "step": 93, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.2032, + "grad_norm": 2.421875, + "learning_rate": 2.4997843701604404e-05, + "long_answer_loss": 0.2032, + "loss": 0.1787, + "short_answer_loss": NaN, + "step": 94, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1891, + "grad_norm": 2.25, + "learning_rate": 2.4997546621321914e-05, + "long_answer_loss": 0.1891, + "loss": 0.1738, + "short_answer_loss": NaN, + "step": 95, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1959, + "grad_norm": 2.71875, + "learning_rate": 2.4997230377155972e-05, + "long_answer_loss": 0.1959, + "loss": 0.1872, + "short_answer_loss": NaN, + "step": 96, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1873, + "grad_norm": 2.484375, + "learning_rate": 2.499689496959151e-05, + "long_answer_loss": 0.1873, + "loss": 0.1889, + "short_answer_loss": NaN, + "step": 97, + "template_loss": 0.0 + }, + { + "epoch": 0.07, + "full_loss": 0.1609, + "grad_norm": 2.1875, + "learning_rate": 2.499654039914285e-05, + "long_answer_loss": 0.1609, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 98, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.172, + "grad_norm": 2.6875, + "learning_rate": 2.499616666635368e-05, + "long_answer_loss": 0.172, + "loss": 0.1797, + "short_answer_loss": NaN, + "step": 99, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1783, + "grad_norm": 2.234375, + "learning_rate": 2.4995773771797104e-05, + "long_answer_loss": 0.1783, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 100, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1639, + "grad_norm": 2.171875, + "learning_rate": 2.4995361716075583e-05, + "long_answer_loss": 0.1639, + "loss": 0.1703, + "short_answer_loss": NaN, + "step": 101, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1762, + "grad_norm": 2.4375, + "learning_rate": 2.4994930499820965e-05, + "long_answer_loss": 0.1762, + "loss": 0.1865, + "short_answer_loss": NaN, + "step": 102, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1901, + "grad_norm": 2.390625, + "learning_rate": 2.4994480123694486e-05, + "long_answer_loss": 0.1901, + "loss": 0.1906, + "short_answer_loss": NaN, + "step": 103, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1734, + "grad_norm": 2.5, + "learning_rate": 2.4994010588386757e-05, + "long_answer_loss": 0.1734, + "loss": 0.1868, + "short_answer_loss": NaN, + "step": 104, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1668, + "grad_norm": 2.8125, + "learning_rate": 2.4993521894617772e-05, + "long_answer_loss": 0.1668, + "loss": 0.1804, + "short_answer_loss": NaN, + "step": 105, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1931, + "grad_norm": 2.390625, + "learning_rate": 2.49930140431369e-05, + "long_answer_loss": 0.1931, + "loss": 0.1893, + "short_answer_loss": NaN, + "step": 106, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1763, + "grad_norm": 2.359375, + "learning_rate": 2.4992487034722875e-05, + "long_answer_loss": 0.1763, + "loss": 0.187, + "short_answer_loss": NaN, + "step": 107, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.2037, + "grad_norm": 2.390625, + "learning_rate": 2.499194087018383e-05, + "long_answer_loss": 0.2037, + "loss": 0.1968, + "short_answer_loss": NaN, + "step": 108, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1873, + "grad_norm": 2.171875, + "learning_rate": 2.4991375550357253e-05, + "long_answer_loss": 0.1873, + "loss": 0.1832, + "short_answer_loss": NaN, + "step": 109, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.2097, + "grad_norm": 2.265625, + "learning_rate": 2.499079107611002e-05, + "long_answer_loss": 0.2097, + "loss": 0.1836, + "short_answer_loss": NaN, + "step": 110, + "template_loss": 0.0 + }, + { + "epoch": 0.08, + "full_loss": 0.1797, + "grad_norm": 2.265625, + "learning_rate": 2.4990187448338365e-05, + "long_answer_loss": 0.1797, + "loss": 0.1818, + "short_answer_loss": NaN, + "step": 111, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1816, + "grad_norm": 2.3125, + "learning_rate": 2.4989564667967902e-05, + "long_answer_loss": 0.1816, + "loss": 0.188, + "short_answer_loss": NaN, + "step": 112, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1719, + "grad_norm": 2.140625, + "learning_rate": 2.4988922735953603e-05, + "long_answer_loss": 0.1719, + "loss": 0.1794, + "short_answer_loss": NaN, + "step": 113, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1852, + "grad_norm": 2.140625, + "learning_rate": 2.4988261653279815e-05, + "long_answer_loss": 0.1852, + "loss": 0.1769, + "short_answer_loss": NaN, + "step": 114, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.2302, + "grad_norm": 2.109375, + "learning_rate": 2.4987581420960253e-05, + "long_answer_loss": 0.2302, + "loss": 0.1863, + "short_answer_loss": NaN, + "step": 115, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.2028, + "grad_norm": 2.1875, + "learning_rate": 2.4986882040037994e-05, + "long_answer_loss": 0.2028, + "loss": 0.1779, + "short_answer_loss": NaN, + "step": 116, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1978, + "grad_norm": 2.28125, + "learning_rate": 2.4986163511585474e-05, + "long_answer_loss": 0.1978, + "loss": 0.1776, + "short_answer_loss": NaN, + "step": 117, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1804, + "grad_norm": 2.1875, + "learning_rate": 2.49854258367045e-05, + "long_answer_loss": 0.1804, + "loss": 0.1817, + "short_answer_loss": NaN, + "step": 118, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1766, + "grad_norm": 2.078125, + "learning_rate": 2.498466901652622e-05, + "long_answer_loss": 0.1766, + "loss": 0.1877, + "short_answer_loss": NaN, + "step": 119, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1824, + "grad_norm": 2.3125, + "learning_rate": 2.498389305221116e-05, + "long_answer_loss": 0.1824, + "loss": 0.1831, + "short_answer_loss": NaN, + "step": 120, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1967, + "grad_norm": 2.140625, + "learning_rate": 2.4983097944949187e-05, + "long_answer_loss": 0.1967, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 121, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1834, + "grad_norm": 2.171875, + "learning_rate": 2.4982283695959525e-05, + "long_answer_loss": 0.1834, + "loss": 0.1771, + "short_answer_loss": NaN, + "step": 122, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.1605, + "grad_norm": 2.15625, + "learning_rate": 2.4981450306490762e-05, + "long_answer_loss": 0.1605, + "loss": 0.1849, + "short_answer_loss": NaN, + "step": 123, + "template_loss": 0.0 + }, + { + "epoch": 0.09, + "full_loss": 0.175, + "grad_norm": 2.203125, + "learning_rate": 2.4980597777820826e-05, + "long_answer_loss": 0.175, + "loss": 0.1816, + "short_answer_loss": NaN, + "step": 124, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1695, + "grad_norm": 2.296875, + "learning_rate": 2.4979726111256983e-05, + "long_answer_loss": 0.1695, + "loss": 0.1809, + "short_answer_loss": NaN, + "step": 125, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1986, + "grad_norm": 2.3125, + "learning_rate": 2.4978835308135873e-05, + "long_answer_loss": 0.1986, + "loss": 0.1796, + "short_answer_loss": NaN, + "step": 126, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1911, + "grad_norm": 2.375, + "learning_rate": 2.497792536982345e-05, + "long_answer_loss": 0.1911, + "loss": 0.1875, + "short_answer_loss": NaN, + "step": 127, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1701, + "grad_norm": 2.390625, + "learning_rate": 2.4976996297715033e-05, + "long_answer_loss": 0.1701, + "loss": 0.1884, + "short_answer_loss": NaN, + "step": 128, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1756, + "grad_norm": 1.9609375, + "learning_rate": 2.4976048093235265e-05, + "long_answer_loss": 0.1756, + "loss": 0.1789, + "short_answer_loss": NaN, + "step": 129, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1673, + "grad_norm": 2.0625, + "learning_rate": 2.4975080757838145e-05, + "long_answer_loss": 0.1673, + "loss": 0.1807, + "short_answer_loss": NaN, + "step": 130, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1738, + "grad_norm": 1.96875, + "learning_rate": 2.497409429300698e-05, + "long_answer_loss": 0.1738, + "loss": 0.1792, + "short_answer_loss": NaN, + "step": 131, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1517, + "grad_norm": 1.8671875, + "learning_rate": 2.4973088700254437e-05, + "long_answer_loss": 0.1517, + "loss": 0.1699, + "short_answer_loss": NaN, + "step": 132, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1914, + "grad_norm": 2.125, + "learning_rate": 2.4972063981122508e-05, + "long_answer_loss": 0.1914, + "loss": 0.1816, + "short_answer_loss": NaN, + "step": 133, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1903, + "grad_norm": 1.90625, + "learning_rate": 2.4971020137182498e-05, + "long_answer_loss": 0.1903, + "loss": 0.1752, + "short_answer_loss": NaN, + "step": 134, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.2153, + "grad_norm": 2.125, + "learning_rate": 2.4969957170035056e-05, + "long_answer_loss": 0.2153, + "loss": 0.1838, + "short_answer_loss": NaN, + "step": 135, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1954, + "grad_norm": 2.046875, + "learning_rate": 2.4968875081310148e-05, + "long_answer_loss": 0.1954, + "loss": 0.1849, + "short_answer_loss": NaN, + "step": 136, + "template_loss": 0.0 + }, + { + "epoch": 0.1, + "full_loss": 0.1911, + "grad_norm": 1.921875, + "learning_rate": 2.4967773872667062e-05, + "long_answer_loss": 0.1911, + "loss": 0.1807, + "short_answer_loss": NaN, + "step": 137, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.2322, + "grad_norm": 2.1875, + "learning_rate": 2.4966653545794398e-05, + "long_answer_loss": 0.2322, + "loss": 0.1954, + "short_answer_loss": NaN, + "step": 138, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1962, + "grad_norm": 2.140625, + "learning_rate": 2.4965514102410083e-05, + "long_answer_loss": 0.1962, + "loss": 0.1933, + "short_answer_loss": NaN, + "step": 139, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1697, + "grad_norm": 2.078125, + "learning_rate": 2.4964355544261357e-05, + "long_answer_loss": 0.1697, + "loss": 0.1816, + "short_answer_loss": NaN, + "step": 140, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.2178, + "grad_norm": 2.03125, + "learning_rate": 2.496317787312476e-05, + "long_answer_loss": 0.2178, + "loss": 0.1807, + "short_answer_loss": NaN, + "step": 141, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1866, + "grad_norm": 2.015625, + "learning_rate": 2.4961981090806147e-05, + "long_answer_loss": 0.1866, + "loss": 0.1857, + "short_answer_loss": NaN, + "step": 142, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1596, + "grad_norm": 1.9453125, + "learning_rate": 2.4960765199140682e-05, + "long_answer_loss": 0.1596, + "loss": 0.18, + "short_answer_loss": NaN, + "step": 143, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.2018, + "grad_norm": 1.9765625, + "learning_rate": 2.4959530199992826e-05, + "long_answer_loss": 0.2018, + "loss": 0.1838, + "short_answer_loss": NaN, + "step": 144, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1635, + "grad_norm": 2.078125, + "learning_rate": 2.4958276095256335e-05, + "long_answer_loss": 0.1635, + "loss": 0.1834, + "short_answer_loss": NaN, + "step": 145, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.2144, + "grad_norm": 2.046875, + "learning_rate": 2.4957002886854277e-05, + "long_answer_loss": 0.2144, + "loss": 0.1835, + "short_answer_loss": NaN, + "step": 146, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1922, + "grad_norm": 2.09375, + "learning_rate": 2.4955710576739e-05, + "long_answer_loss": 0.1922, + "loss": 0.1844, + "short_answer_loss": NaN, + "step": 147, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1819, + "grad_norm": 1.8984375, + "learning_rate": 2.4954399166892152e-05, + "long_answer_loss": 0.1819, + "loss": 0.1708, + "short_answer_loss": NaN, + "step": 148, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.2108, + "grad_norm": 1.9453125, + "learning_rate": 2.495306865932465e-05, + "long_answer_loss": 0.2108, + "loss": 0.1799, + "short_answer_loss": NaN, + "step": 149, + "template_loss": 0.0 + }, + { + "epoch": 0.11, + "full_loss": 0.1585, + "grad_norm": 1.9375, + "learning_rate": 2.4951719056076728e-05, + "long_answer_loss": 0.1585, + "loss": 0.1703, + "short_answer_loss": NaN, + "step": 150, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1981, + "grad_norm": 2.015625, + "learning_rate": 2.495035035921787e-05, + "long_answer_loss": 0.1981, + "loss": 0.1838, + "short_answer_loss": NaN, + "step": 151, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1811, + "grad_norm": 2.015625, + "learning_rate": 2.4948962570846864e-05, + "long_answer_loss": 0.1811, + "loss": 0.181, + "short_answer_loss": NaN, + "step": 152, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1662, + "grad_norm": 1.875, + "learning_rate": 2.494755569309175e-05, + "long_answer_loss": 0.1662, + "loss": 0.1741, + "short_answer_loss": NaN, + "step": 153, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1624, + "grad_norm": 2.03125, + "learning_rate": 2.4946129728109854e-05, + "long_answer_loss": 0.1624, + "loss": 0.1794, + "short_answer_loss": NaN, + "step": 154, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1687, + "grad_norm": 1.859375, + "learning_rate": 2.494468467808777e-05, + "long_answer_loss": 0.1687, + "loss": 0.1754, + "short_answer_loss": NaN, + "step": 155, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1637, + "grad_norm": 1.90625, + "learning_rate": 2.4943220545241346e-05, + "long_answer_loss": 0.1637, + "loss": 0.1802, + "short_answer_loss": NaN, + "step": 156, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1815, + "grad_norm": 1.84375, + "learning_rate": 2.494173733181571e-05, + "long_answer_loss": 0.1815, + "loss": 0.1806, + "short_answer_loss": NaN, + "step": 157, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1729, + "grad_norm": 1.984375, + "learning_rate": 2.4940235040085243e-05, + "long_answer_loss": 0.1729, + "loss": 0.1866, + "short_answer_loss": NaN, + "step": 158, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1794, + "grad_norm": 1.90625, + "learning_rate": 2.493871367235356e-05, + "long_answer_loss": 0.1794, + "loss": 0.1808, + "short_answer_loss": NaN, + "step": 159, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1691, + "grad_norm": 1.96875, + "learning_rate": 2.4937173230953554e-05, + "long_answer_loss": 0.1691, + "loss": 0.1772, + "short_answer_loss": NaN, + "step": 160, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1698, + "grad_norm": 1.90625, + "learning_rate": 2.493561371824736e-05, + "long_answer_loss": 0.1698, + "loss": 0.19, + "short_answer_loss": NaN, + "step": 161, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1616, + "grad_norm": 1.8828125, + "learning_rate": 2.4934035136626338e-05, + "long_answer_loss": 0.1616, + "loss": 0.1851, + "short_answer_loss": NaN, + "step": 162, + "template_loss": 0.0 + }, + { + "epoch": 0.12, + "full_loss": 0.1754, + "grad_norm": 1.9921875, + "learning_rate": 2.493243748851112e-05, + "long_answer_loss": 0.1754, + "loss": 0.1789, + "short_answer_loss": NaN, + "step": 163, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1766, + "grad_norm": 2.015625, + "learning_rate": 2.4930820776351548e-05, + "long_answer_loss": 0.1766, + "loss": 0.1832, + "short_answer_loss": NaN, + "step": 164, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1541, + "grad_norm": 1.984375, + "learning_rate": 2.4929185002626714e-05, + "long_answer_loss": 0.1541, + "loss": 0.1781, + "short_answer_loss": NaN, + "step": 165, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.178, + "grad_norm": 2.046875, + "learning_rate": 2.492753016984493e-05, + "long_answer_loss": 0.178, + "loss": 0.1802, + "short_answer_loss": NaN, + "step": 166, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1843, + "grad_norm": 1.7890625, + "learning_rate": 2.492585628054373e-05, + "long_answer_loss": 0.1843, + "loss": 0.1885, + "short_answer_loss": NaN, + "step": 167, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1808, + "grad_norm": 2.0, + "learning_rate": 2.4924163337289885e-05, + "long_answer_loss": 0.1808, + "loss": 0.1821, + "short_answer_loss": NaN, + "step": 168, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.2042, + "grad_norm": 1.875, + "learning_rate": 2.4922451342679366e-05, + "long_answer_loss": 0.2042, + "loss": 0.1784, + "short_answer_loss": NaN, + "step": 169, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1893, + "grad_norm": 1.953125, + "learning_rate": 2.492072029933737e-05, + "long_answer_loss": 0.1893, + "loss": 0.1797, + "short_answer_loss": NaN, + "step": 170, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1749, + "grad_norm": 1.8359375, + "learning_rate": 2.4918970209918296e-05, + "long_answer_loss": 0.1749, + "loss": 0.1706, + "short_answer_loss": NaN, + "step": 171, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1807, + "grad_norm": 1.921875, + "learning_rate": 2.4917201077105757e-05, + "long_answer_loss": 0.1807, + "loss": 0.1828, + "short_answer_loss": NaN, + "step": 172, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1641, + "grad_norm": 1.90625, + "learning_rate": 2.4915412903612554e-05, + "long_answer_loss": 0.1641, + "loss": 0.1846, + "short_answer_loss": NaN, + "step": 173, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1789, + "grad_norm": 1.9453125, + "learning_rate": 2.4913605692180696e-05, + "long_answer_loss": 0.1789, + "loss": 0.1846, + "short_answer_loss": NaN, + "step": 174, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.1687, + "grad_norm": 1.8125, + "learning_rate": 2.4911779445581384e-05, + "long_answer_loss": 0.1687, + "loss": 0.1882, + "short_answer_loss": NaN, + "step": 175, + "template_loss": 0.0 + }, + { + "epoch": 0.13, + "full_loss": 0.181, + "grad_norm": 1.9296875, + "learning_rate": 2.4909934166615006e-05, + "long_answer_loss": 0.181, + "loss": 0.1751, + "short_answer_loss": NaN, + "step": 176, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1749, + "grad_norm": 2.0, + "learning_rate": 2.4908069858111133e-05, + "long_answer_loss": 0.1749, + "loss": 0.1779, + "short_answer_loss": NaN, + "step": 177, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1906, + "grad_norm": 2.171875, + "learning_rate": 2.4906186522928516e-05, + "long_answer_loss": 0.1906, + "loss": 0.1921, + "short_answer_loss": NaN, + "step": 178, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1641, + "grad_norm": 1.953125, + "learning_rate": 2.490428416395509e-05, + "long_answer_loss": 0.1641, + "loss": 0.1869, + "short_answer_loss": NaN, + "step": 179, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1781, + "grad_norm": 2.125, + "learning_rate": 2.490236278410794e-05, + "long_answer_loss": 0.1781, + "loss": 0.1886, + "short_answer_loss": NaN, + "step": 180, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1526, + "grad_norm": 2.046875, + "learning_rate": 2.490042238633335e-05, + "long_answer_loss": 0.1526, + "loss": 0.1808, + "short_answer_loss": NaN, + "step": 181, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1651, + "grad_norm": 1.8515625, + "learning_rate": 2.4898462973606736e-05, + "long_answer_loss": 0.1651, + "loss": 0.1718, + "short_answer_loss": NaN, + "step": 182, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1868, + "grad_norm": 1.953125, + "learning_rate": 2.4896484548932686e-05, + "long_answer_loss": 0.1868, + "loss": 0.1772, + "short_answer_loss": NaN, + "step": 183, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.175, + "grad_norm": 1.78125, + "learning_rate": 2.489448711534494e-05, + "long_answer_loss": 0.175, + "loss": 0.1753, + "short_answer_loss": NaN, + "step": 184, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.2003, + "grad_norm": 1.8515625, + "learning_rate": 2.4892470675906394e-05, + "long_answer_loss": 0.2003, + "loss": 0.1866, + "short_answer_loss": NaN, + "step": 185, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1635, + "grad_norm": 2.015625, + "learning_rate": 2.4890435233709066e-05, + "long_answer_loss": 0.1635, + "loss": 0.1782, + "short_answer_loss": NaN, + "step": 186, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1667, + "grad_norm": 1.8203125, + "learning_rate": 2.4888380791874137e-05, + "long_answer_loss": 0.1667, + "loss": 0.1785, + "short_answer_loss": NaN, + "step": 187, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1869, + "grad_norm": 2.125, + "learning_rate": 2.4886307353551906e-05, + "long_answer_loss": 0.1869, + "loss": 0.1782, + "short_answer_loss": NaN, + "step": 188, + "template_loss": 0.0 + }, + { + "epoch": 0.14, + "full_loss": 0.1848, + "grad_norm": 1.84375, + "learning_rate": 2.4884214921921813e-05, + "long_answer_loss": 0.1848, + "loss": 0.1631, + "short_answer_loss": NaN, + "step": 189, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1732, + "grad_norm": 1.859375, + "learning_rate": 2.4882103500192415e-05, + "long_answer_loss": 0.1732, + "loss": 0.1824, + "short_answer_loss": NaN, + "step": 190, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.2009, + "grad_norm": 2.015625, + "learning_rate": 2.4879973091601387e-05, + "long_answer_loss": 0.2009, + "loss": 0.1801, + "short_answer_loss": NaN, + "step": 191, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1753, + "grad_norm": 1.9296875, + "learning_rate": 2.487782369941553e-05, + "long_answer_loss": 0.1753, + "loss": 0.1782, + "short_answer_loss": NaN, + "step": 192, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1967, + "grad_norm": 2.046875, + "learning_rate": 2.4875655326930736e-05, + "long_answer_loss": 0.1967, + "loss": 0.1889, + "short_answer_loss": NaN, + "step": 193, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.2104, + "grad_norm": 1.90625, + "learning_rate": 2.4873467977472025e-05, + "long_answer_loss": 0.2104, + "loss": 0.1812, + "short_answer_loss": NaN, + "step": 194, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1992, + "grad_norm": 1.7265625, + "learning_rate": 2.487126165439349e-05, + "long_answer_loss": 0.1992, + "loss": 0.1791, + "short_answer_loss": NaN, + "step": 195, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1856, + "grad_norm": 2.375, + "learning_rate": 2.4869036361078345e-05, + "long_answer_loss": 0.1856, + "loss": 0.1859, + "short_answer_loss": NaN, + "step": 196, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1755, + "grad_norm": 1.875, + "learning_rate": 2.486679210093888e-05, + "long_answer_loss": 0.1755, + "loss": 0.169, + "short_answer_loss": NaN, + "step": 197, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.183, + "grad_norm": 1.78125, + "learning_rate": 2.486452887741646e-05, + "long_answer_loss": 0.183, + "loss": 0.1848, + "short_answer_loss": NaN, + "step": 198, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1781, + "grad_norm": 2.03125, + "learning_rate": 2.4862246693981544e-05, + "long_answer_loss": 0.1781, + "loss": 0.1751, + "short_answer_loss": NaN, + "step": 199, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.1652, + "grad_norm": 1.8046875, + "learning_rate": 2.4859945554133662e-05, + "long_answer_loss": 0.1652, + "loss": 0.1812, + "short_answer_loss": NaN, + "step": 200, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.177, + "grad_norm": 1.7421875, + "learning_rate": 2.4857625461401404e-05, + "long_answer_loss": 0.177, + "loss": 0.1734, + "short_answer_loss": NaN, + "step": 201, + "template_loss": 0.0 + }, + { + "epoch": 0.15, + "full_loss": 0.196, + "grad_norm": 1.875, + "learning_rate": 2.4855286419342428e-05, + "long_answer_loss": 0.196, + "loss": 0.1882, + "short_answer_loss": NaN, + "step": 202, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.2024, + "grad_norm": 1.7421875, + "learning_rate": 2.485292843154345e-05, + "long_answer_loss": 0.2024, + "loss": 0.181, + "short_answer_loss": NaN, + "step": 203, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1584, + "grad_norm": 1.6796875, + "learning_rate": 2.4850551501620235e-05, + "long_answer_loss": 0.1584, + "loss": 0.1746, + "short_answer_loss": NaN, + "step": 204, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.179, + "grad_norm": 1.7421875, + "learning_rate": 2.484815563321759e-05, + "long_answer_loss": 0.179, + "loss": 0.1755, + "short_answer_loss": NaN, + "step": 205, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1756, + "grad_norm": 1.9765625, + "learning_rate": 2.484574083000938e-05, + "long_answer_loss": 0.1756, + "loss": 0.1772, + "short_answer_loss": NaN, + "step": 206, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1868, + "grad_norm": 1.7890625, + "learning_rate": 2.4843307095698476e-05, + "long_answer_loss": 0.1868, + "loss": 0.1763, + "short_answer_loss": NaN, + "step": 207, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1768, + "grad_norm": 1.6328125, + "learning_rate": 2.4840854434016808e-05, + "long_answer_loss": 0.1768, + "loss": 0.1763, + "short_answer_loss": NaN, + "step": 208, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.2088, + "grad_norm": 1.75, + "learning_rate": 2.4838382848725312e-05, + "long_answer_loss": 0.2088, + "loss": 0.1811, + "short_answer_loss": NaN, + "step": 209, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1809, + "grad_norm": 1.7578125, + "learning_rate": 2.4835892343613943e-05, + "long_answer_loss": 0.1809, + "loss": 0.178, + "short_answer_loss": NaN, + "step": 210, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.158, + "grad_norm": 1.78125, + "learning_rate": 2.4833382922501668e-05, + "long_answer_loss": 0.158, + "loss": 0.1679, + "short_answer_loss": NaN, + "step": 211, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1787, + "grad_norm": 1.8125, + "learning_rate": 2.4830854589236475e-05, + "long_answer_loss": 0.1787, + "loss": 0.1809, + "short_answer_loss": NaN, + "step": 212, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1763, + "grad_norm": 1.7890625, + "learning_rate": 2.4828307347695326e-05, + "long_answer_loss": 0.1763, + "loss": 0.1729, + "short_answer_loss": NaN, + "step": 213, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1905, + "grad_norm": 1.8203125, + "learning_rate": 2.4825741201784198e-05, + "long_answer_loss": 0.1905, + "loss": 0.1758, + "short_answer_loss": NaN, + "step": 214, + "template_loss": 0.0 + }, + { + "epoch": 0.16, + "full_loss": 0.1986, + "grad_norm": 1.9921875, + "learning_rate": 2.482315615543805e-05, + "long_answer_loss": 0.1986, + "loss": 0.178, + "short_answer_loss": NaN, + "step": 215, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1832, + "grad_norm": 1.71875, + "learning_rate": 2.482055221262081e-05, + "long_answer_loss": 0.1832, + "loss": 0.1813, + "short_answer_loss": NaN, + "step": 216, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1984, + "grad_norm": 1.90625, + "learning_rate": 2.4817929377325413e-05, + "long_answer_loss": 0.1984, + "loss": 0.179, + "short_answer_loss": NaN, + "step": 217, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.2032, + "grad_norm": 1.625, + "learning_rate": 2.4815287653573733e-05, + "long_answer_loss": 0.2032, + "loss": 0.1802, + "short_answer_loss": NaN, + "step": 218, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1822, + "grad_norm": 1.7734375, + "learning_rate": 2.4812627045416623e-05, + "long_answer_loss": 0.1822, + "loss": 0.182, + "short_answer_loss": NaN, + "step": 219, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1751, + "grad_norm": 1.65625, + "learning_rate": 2.4809947556933886e-05, + "long_answer_loss": 0.1751, + "loss": 0.177, + "short_answer_loss": NaN, + "step": 220, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.158, + "grad_norm": 1.6484375, + "learning_rate": 2.4807249192234293e-05, + "long_answer_loss": 0.158, + "loss": 0.1713, + "short_answer_loss": NaN, + "step": 221, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1628, + "grad_norm": 1.625, + "learning_rate": 2.4804531955455534e-05, + "long_answer_loss": 0.1628, + "loss": 0.1689, + "short_answer_loss": NaN, + "step": 222, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.207, + "grad_norm": 1.6796875, + "learning_rate": 2.480179585076426e-05, + "long_answer_loss": 0.207, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 223, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1636, + "grad_norm": 2.0, + "learning_rate": 2.4799040882356044e-05, + "long_answer_loss": 0.1636, + "loss": 0.1736, + "short_answer_loss": NaN, + "step": 224, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1432, + "grad_norm": 1.9375, + "learning_rate": 2.4796267054455384e-05, + "long_answer_loss": 0.1432, + "loss": 0.1869, + "short_answer_loss": NaN, + "step": 225, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1877, + "grad_norm": 1.9609375, + "learning_rate": 2.47934743713157e-05, + "long_answer_loss": 0.1877, + "loss": 0.1771, + "short_answer_loss": NaN, + "step": 226, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1752, + "grad_norm": 1.8203125, + "learning_rate": 2.479066283721933e-05, + "long_answer_loss": 0.1752, + "loss": 0.18, + "short_answer_loss": NaN, + "step": 227, + "template_loss": 0.0 + }, + { + "epoch": 0.17, + "full_loss": 0.1989, + "grad_norm": 1.8046875, + "learning_rate": 2.478783245647751e-05, + "long_answer_loss": 0.1989, + "loss": 0.1752, + "short_answer_loss": NaN, + "step": 228, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1757, + "grad_norm": 1.6796875, + "learning_rate": 2.4784983233430375e-05, + "long_answer_loss": 0.1757, + "loss": 0.1711, + "short_answer_loss": NaN, + "step": 229, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1674, + "grad_norm": 1.7890625, + "learning_rate": 2.4782115172446966e-05, + "long_answer_loss": 0.1674, + "loss": 0.1734, + "short_answer_loss": NaN, + "step": 230, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.2165, + "grad_norm": 1.7890625, + "learning_rate": 2.4779228277925193e-05, + "long_answer_loss": 0.2165, + "loss": 0.1832, + "short_answer_loss": NaN, + "step": 231, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.178, + "grad_norm": 1.7734375, + "learning_rate": 2.4776322554291854e-05, + "long_answer_loss": 0.178, + "loss": 0.175, + "short_answer_loss": NaN, + "step": 232, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1577, + "grad_norm": 1.6875, + "learning_rate": 2.4773398006002625e-05, + "long_answer_loss": 0.1577, + "loss": 0.1708, + "short_answer_loss": NaN, + "step": 233, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1855, + "grad_norm": 1.8671875, + "learning_rate": 2.4770454637542035e-05, + "long_answer_loss": 0.1855, + "loss": 0.182, + "short_answer_loss": NaN, + "step": 234, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1642, + "grad_norm": 1.734375, + "learning_rate": 2.4767492453423487e-05, + "long_answer_loss": 0.1642, + "loss": 0.1778, + "short_answer_loss": NaN, + "step": 235, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1711, + "grad_norm": 1.859375, + "learning_rate": 2.4764511458189222e-05, + "long_answer_loss": 0.1711, + "loss": 0.1804, + "short_answer_loss": NaN, + "step": 236, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1805, + "grad_norm": 2.015625, + "learning_rate": 2.4761511656410334e-05, + "long_answer_loss": 0.1805, + "loss": 0.1823, + "short_answer_loss": NaN, + "step": 237, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1689, + "grad_norm": 1.765625, + "learning_rate": 2.4758493052686758e-05, + "long_answer_loss": 0.1689, + "loss": 0.1765, + "short_answer_loss": NaN, + "step": 238, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1683, + "grad_norm": 1.6640625, + "learning_rate": 2.4755455651647255e-05, + "long_answer_loss": 0.1683, + "loss": 0.1751, + "short_answer_loss": NaN, + "step": 239, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1831, + "grad_norm": 1.90625, + "learning_rate": 2.475239945794941e-05, + "long_answer_loss": 0.1831, + "loss": 0.178, + "short_answer_loss": NaN, + "step": 240, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.1917, + "grad_norm": 1.734375, + "learning_rate": 2.4749324476279622e-05, + "long_answer_loss": 0.1917, + "loss": 0.1782, + "short_answer_loss": NaN, + "step": 241, + "template_loss": 0.0 + }, + { + "epoch": 0.18, + "full_loss": 0.202, + "grad_norm": 1.65625, + "learning_rate": 2.4746230711353115e-05, + "long_answer_loss": 0.202, + "loss": 0.1761, + "short_answer_loss": NaN, + "step": 242, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1779, + "grad_norm": 1.7421875, + "learning_rate": 2.4743118167913893e-05, + "long_answer_loss": 0.1779, + "loss": 0.1727, + "short_answer_loss": NaN, + "step": 243, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1835, + "grad_norm": 1.7109375, + "learning_rate": 2.4739986850734768e-05, + "long_answer_loss": 0.1835, + "loss": 0.1834, + "short_answer_loss": NaN, + "step": 244, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1973, + "grad_norm": 1.7734375, + "learning_rate": 2.473683676461734e-05, + "long_answer_loss": 0.1973, + "loss": 0.1825, + "short_answer_loss": NaN, + "step": 245, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.16, + "grad_norm": 1.671875, + "learning_rate": 2.473366791439199e-05, + "long_answer_loss": 0.16, + "loss": 0.1705, + "short_answer_loss": NaN, + "step": 246, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1712, + "grad_norm": 1.984375, + "learning_rate": 2.473048030491787e-05, + "long_answer_loss": 0.1712, + "loss": 0.1697, + "short_answer_loss": NaN, + "step": 247, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1704, + "grad_norm": 1.9296875, + "learning_rate": 2.472727394108289e-05, + "long_answer_loss": 0.1704, + "loss": 0.1754, + "short_answer_loss": NaN, + "step": 248, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1551, + "grad_norm": 1.8671875, + "learning_rate": 2.4724048827803738e-05, + "long_answer_loss": 0.1551, + "loss": 0.173, + "short_answer_loss": NaN, + "step": 249, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1998, + "grad_norm": 1.8984375, + "learning_rate": 2.4720804970025827e-05, + "long_answer_loss": 0.1998, + "loss": 0.1799, + "short_answer_loss": NaN, + "step": 250, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1711, + "grad_norm": 1.8046875, + "learning_rate": 2.4717542372723333e-05, + "long_answer_loss": 0.1711, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 251, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1787, + "grad_norm": 1.671875, + "learning_rate": 2.471426104089916e-05, + "long_answer_loss": 0.1787, + "loss": 0.1786, + "short_answer_loss": NaN, + "step": 252, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.2084, + "grad_norm": 1.75, + "learning_rate": 2.4710960979584945e-05, + "long_answer_loss": 0.2084, + "loss": 0.1703, + "short_answer_loss": NaN, + "step": 253, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1814, + "grad_norm": 1.703125, + "learning_rate": 2.4707642193841036e-05, + "long_answer_loss": 0.1814, + "loss": 0.1756, + "short_answer_loss": NaN, + "step": 254, + "template_loss": 0.0 + }, + { + "epoch": 0.19, + "full_loss": 0.1797, + "grad_norm": 1.765625, + "learning_rate": 2.470430468875649e-05, + "long_answer_loss": 0.1797, + "loss": 0.1834, + "short_answer_loss": NaN, + "step": 255, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1801, + "grad_norm": 1.765625, + "learning_rate": 2.4700948469449092e-05, + "long_answer_loss": 0.1801, + "loss": 0.1803, + "short_answer_loss": NaN, + "step": 256, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1742, + "grad_norm": 1.7265625, + "learning_rate": 2.4697573541065295e-05, + "long_answer_loss": 0.1742, + "loss": 0.1847, + "short_answer_loss": NaN, + "step": 257, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1778, + "grad_norm": 1.6484375, + "learning_rate": 2.4694179908780257e-05, + "long_answer_loss": 0.1778, + "loss": 0.1741, + "short_answer_loss": NaN, + "step": 258, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1916, + "grad_norm": 1.890625, + "learning_rate": 2.469076757779782e-05, + "long_answer_loss": 0.1916, + "loss": 0.1832, + "short_answer_loss": NaN, + "step": 259, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1921, + "grad_norm": 1.6875, + "learning_rate": 2.4687336553350482e-05, + "long_answer_loss": 0.1921, + "loss": 0.1746, + "short_answer_loss": NaN, + "step": 260, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1907, + "grad_norm": 1.875, + "learning_rate": 2.4683886840699422e-05, + "long_answer_loss": 0.1907, + "loss": 0.1784, + "short_answer_loss": NaN, + "step": 261, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1727, + "grad_norm": 1.65625, + "learning_rate": 2.4680418445134463e-05, + "long_answer_loss": 0.1727, + "loss": 0.1688, + "short_answer_loss": NaN, + "step": 262, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1382, + "grad_norm": 1.8671875, + "learning_rate": 2.4676931371974094e-05, + "long_answer_loss": 0.1382, + "loss": 0.1743, + "short_answer_loss": NaN, + "step": 263, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1783, + "grad_norm": 1.7578125, + "learning_rate": 2.467342562656542e-05, + "long_answer_loss": 0.1783, + "loss": 0.1789, + "short_answer_loss": NaN, + "step": 264, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.169, + "grad_norm": 1.8046875, + "learning_rate": 2.466990121428421e-05, + "long_answer_loss": 0.169, + "loss": 0.1765, + "short_answer_loss": NaN, + "step": 265, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1932, + "grad_norm": 1.9140625, + "learning_rate": 2.4666358140534817e-05, + "long_answer_loss": 0.1932, + "loss": 0.1779, + "short_answer_loss": NaN, + "step": 266, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1589, + "grad_norm": 1.7421875, + "learning_rate": 2.466279641075025e-05, + "long_answer_loss": 0.1589, + "loss": 0.1763, + "short_answer_loss": NaN, + "step": 267, + "template_loss": 0.0 + }, + { + "epoch": 0.2, + "full_loss": 0.1768, + "grad_norm": 1.8046875, + "learning_rate": 2.4659216030392098e-05, + "long_answer_loss": 0.1768, + "loss": 0.1778, + "short_answer_loss": NaN, + "step": 268, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1877, + "grad_norm": 1.6328125, + "learning_rate": 2.4655617004950553e-05, + "long_answer_loss": 0.1877, + "loss": 0.1734, + "short_answer_loss": NaN, + "step": 269, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1784, + "grad_norm": 1.6640625, + "learning_rate": 2.4651999339944416e-05, + "long_answer_loss": 0.1784, + "loss": 0.1822, + "short_answer_loss": NaN, + "step": 270, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1583, + "grad_norm": 1.9296875, + "learning_rate": 2.4648363040921047e-05, + "long_answer_loss": 0.1583, + "loss": 0.1836, + "short_answer_loss": NaN, + "step": 271, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1831, + "grad_norm": 1.6015625, + "learning_rate": 2.4644708113456394e-05, + "long_answer_loss": 0.1831, + "loss": 0.1728, + "short_answer_loss": NaN, + "step": 272, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1654, + "grad_norm": 1.6640625, + "learning_rate": 2.4641034563154957e-05, + "long_answer_loss": 0.1654, + "loss": 0.1671, + "short_answer_loss": NaN, + "step": 273, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1872, + "grad_norm": 1.7734375, + "learning_rate": 2.4637342395649815e-05, + "long_answer_loss": 0.1872, + "loss": 0.1793, + "short_answer_loss": NaN, + "step": 274, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1772, + "grad_norm": 1.6875, + "learning_rate": 2.4633631616602566e-05, + "long_answer_loss": 0.1772, + "loss": 0.169, + "short_answer_loss": NaN, + "step": 275, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.2241, + "grad_norm": 1.7578125, + "learning_rate": 2.462990223170337e-05, + "long_answer_loss": 0.2241, + "loss": 0.1874, + "short_answer_loss": NaN, + "step": 276, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1745, + "grad_norm": 1.5390625, + "learning_rate": 2.4626154246670908e-05, + "long_answer_loss": 0.1745, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 277, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.154, + "grad_norm": 1.6875, + "learning_rate": 2.4622387667252384e-05, + "long_answer_loss": 0.154, + "loss": 0.1699, + "short_answer_loss": NaN, + "step": 278, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1981, + "grad_norm": 1.8125, + "learning_rate": 2.4618602499223513e-05, + "long_answer_loss": 0.1981, + "loss": 0.1817, + "short_answer_loss": NaN, + "step": 279, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1763, + "grad_norm": 1.703125, + "learning_rate": 2.4614798748388518e-05, + "long_answer_loss": 0.1763, + "loss": 0.1787, + "short_answer_loss": NaN, + "step": 280, + "template_loss": 0.0 + }, + { + "epoch": 0.21, + "full_loss": 0.1851, + "grad_norm": 1.6640625, + "learning_rate": 2.461097642058011e-05, + "long_answer_loss": 0.1851, + "loss": 0.177, + "short_answer_loss": NaN, + "step": 281, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1804, + "grad_norm": 1.75, + "learning_rate": 2.4607135521659497e-05, + "long_answer_loss": 0.1804, + "loss": 0.1798, + "short_answer_loss": NaN, + "step": 282, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1867, + "grad_norm": 1.7265625, + "learning_rate": 2.4603276057516356e-05, + "long_answer_loss": 0.1867, + "loss": 0.1818, + "short_answer_loss": NaN, + "step": 283, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1855, + "grad_norm": 1.765625, + "learning_rate": 2.4599398034068836e-05, + "long_answer_loss": 0.1855, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 284, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1853, + "grad_norm": 1.671875, + "learning_rate": 2.4595501457263538e-05, + "long_answer_loss": 0.1853, + "loss": 0.1715, + "short_answer_loss": NaN, + "step": 285, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1618, + "grad_norm": 1.578125, + "learning_rate": 2.4591586333075522e-05, + "long_answer_loss": 0.1618, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 286, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1556, + "grad_norm": 1.7734375, + "learning_rate": 2.4587652667508282e-05, + "long_answer_loss": 0.1556, + "loss": 0.1676, + "short_answer_loss": NaN, + "step": 287, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.196, + "grad_norm": 1.6328125, + "learning_rate": 2.458370046659375e-05, + "long_answer_loss": 0.196, + "loss": 0.1759, + "short_answer_loss": NaN, + "step": 288, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1929, + "grad_norm": 1.7578125, + "learning_rate": 2.457972973639228e-05, + "long_answer_loss": 0.1929, + "loss": 0.1803, + "short_answer_loss": NaN, + "step": 289, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1833, + "grad_norm": 1.9609375, + "learning_rate": 2.4575740482992625e-05, + "long_answer_loss": 0.1833, + "loss": 0.1831, + "short_answer_loss": NaN, + "step": 290, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.175, + "grad_norm": 1.609375, + "learning_rate": 2.4571732712511967e-05, + "long_answer_loss": 0.175, + "loss": 0.177, + "short_answer_loss": NaN, + "step": 291, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1867, + "grad_norm": 1.7890625, + "learning_rate": 2.4567706431095855e-05, + "long_answer_loss": 0.1867, + "loss": 0.1814, + "short_answer_loss": NaN, + "step": 292, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1801, + "grad_norm": 1.6484375, + "learning_rate": 2.456366164491824e-05, + "long_answer_loss": 0.1801, + "loss": 0.1825, + "short_answer_loss": NaN, + "step": 293, + "template_loss": 0.0 + }, + { + "epoch": 0.22, + "full_loss": 0.1977, + "grad_norm": 1.8515625, + "learning_rate": 2.455959836018145e-05, + "long_answer_loss": 0.1977, + "loss": 0.1818, + "short_answer_loss": NaN, + "step": 294, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1498, + "grad_norm": 1.5703125, + "learning_rate": 2.4555516583116166e-05, + "long_answer_loss": 0.1498, + "loss": 0.1694, + "short_answer_loss": NaN, + "step": 295, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1881, + "grad_norm": 1.515625, + "learning_rate": 2.4551416319981435e-05, + "long_answer_loss": 0.1881, + "loss": 0.1753, + "short_answer_loss": NaN, + "step": 296, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1579, + "grad_norm": 1.6640625, + "learning_rate": 2.4547297577064648e-05, + "long_answer_loss": 0.1579, + "loss": 0.1644, + "short_answer_loss": NaN, + "step": 297, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1872, + "grad_norm": 1.5625, + "learning_rate": 2.4543160360681533e-05, + "long_answer_loss": 0.1872, + "loss": 0.1737, + "short_answer_loss": NaN, + "step": 298, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1537, + "grad_norm": 1.640625, + "learning_rate": 2.4539004677176147e-05, + "long_answer_loss": 0.1537, + "loss": 0.1726, + "short_answer_loss": NaN, + "step": 299, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1694, + "grad_norm": 1.7890625, + "learning_rate": 2.453483053292086e-05, + "long_answer_loss": 0.1694, + "loss": 0.177, + "short_answer_loss": NaN, + "step": 300, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1746, + "grad_norm": 1.6875, + "learning_rate": 2.453063793431636e-05, + "long_answer_loss": 0.1746, + "loss": 0.169, + "short_answer_loss": NaN, + "step": 301, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1845, + "grad_norm": 1.765625, + "learning_rate": 2.4526426887791618e-05, + "long_answer_loss": 0.1845, + "loss": 0.177, + "short_answer_loss": NaN, + "step": 302, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1717, + "grad_norm": 1.703125, + "learning_rate": 2.452219739980391e-05, + "long_answer_loss": 0.1717, + "loss": 0.178, + "short_answer_loss": NaN, + "step": 303, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.2011, + "grad_norm": 1.7734375, + "learning_rate": 2.4517949476838775e-05, + "long_answer_loss": 0.2011, + "loss": 0.176, + "short_answer_loss": NaN, + "step": 304, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1782, + "grad_norm": 1.6640625, + "learning_rate": 2.451368312541003e-05, + "long_answer_loss": 0.1782, + "loss": 0.1744, + "short_answer_loss": NaN, + "step": 305, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1608, + "grad_norm": 1.609375, + "learning_rate": 2.4509398352059755e-05, + "long_answer_loss": 0.1608, + "loss": 0.167, + "short_answer_loss": NaN, + "step": 306, + "template_loss": 0.0 + }, + { + "epoch": 0.23, + "full_loss": 0.1702, + "grad_norm": 1.78125, + "learning_rate": 2.450509516335826e-05, + "long_answer_loss": 0.1702, + "loss": 0.174, + "short_answer_loss": NaN, + "step": 307, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1792, + "grad_norm": 1.5625, + "learning_rate": 2.450077356590411e-05, + "long_answer_loss": 0.1792, + "loss": 0.1678, + "short_answer_loss": NaN, + "step": 308, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1848, + "grad_norm": 1.6328125, + "learning_rate": 2.449643356632409e-05, + "long_answer_loss": 0.1848, + "loss": 0.1747, + "short_answer_loss": NaN, + "step": 309, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1703, + "grad_norm": 1.75, + "learning_rate": 2.4492075171273213e-05, + "long_answer_loss": 0.1703, + "loss": 0.1725, + "short_answer_loss": NaN, + "step": 310, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1823, + "grad_norm": 1.7578125, + "learning_rate": 2.4487698387434687e-05, + "long_answer_loss": 0.1823, + "loss": 0.169, + "short_answer_loss": NaN, + "step": 311, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1899, + "grad_norm": 1.640625, + "learning_rate": 2.4483303221519924e-05, + "long_answer_loss": 0.1899, + "loss": 0.1765, + "short_answer_loss": NaN, + "step": 312, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1695, + "grad_norm": 1.7265625, + "learning_rate": 2.4478889680268525e-05, + "long_answer_loss": 0.1695, + "loss": 0.1677, + "short_answer_loss": NaN, + "step": 313, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1753, + "grad_norm": 1.6484375, + "learning_rate": 2.447445777044826e-05, + "long_answer_loss": 0.1753, + "loss": 0.1715, + "short_answer_loss": NaN, + "step": 314, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1616, + "grad_norm": 1.828125, + "learning_rate": 2.4470007498855074e-05, + "long_answer_loss": 0.1616, + "loss": 0.176, + "short_answer_loss": NaN, + "step": 315, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1811, + "grad_norm": 1.6015625, + "learning_rate": 2.446553887231307e-05, + "long_answer_loss": 0.1811, + "loss": 0.1744, + "short_answer_loss": NaN, + "step": 316, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1735, + "grad_norm": 1.609375, + "learning_rate": 2.4461051897674487e-05, + "long_answer_loss": 0.1735, + "loss": 0.1753, + "short_answer_loss": NaN, + "step": 317, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1725, + "grad_norm": 1.6484375, + "learning_rate": 2.44565465818197e-05, + "long_answer_loss": 0.1725, + "loss": 0.1725, + "short_answer_loss": NaN, + "step": 318, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1573, + "grad_norm": 1.546875, + "learning_rate": 2.4452022931657227e-05, + "long_answer_loss": 0.1573, + "loss": 0.1712, + "short_answer_loss": NaN, + "step": 319, + "template_loss": 0.0 + }, + { + "epoch": 0.24, + "full_loss": 0.1584, + "grad_norm": 1.5234375, + "learning_rate": 2.444748095412367e-05, + "long_answer_loss": 0.1584, + "loss": 0.1666, + "short_answer_loss": NaN, + "step": 320, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1572, + "grad_norm": 1.703125, + "learning_rate": 2.4442920656183753e-05, + "long_answer_loss": 0.1572, + "loss": 0.1697, + "short_answer_loss": NaN, + "step": 321, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.153, + "grad_norm": 1.6484375, + "learning_rate": 2.44383420448303e-05, + "long_answer_loss": 0.153, + "loss": 0.1679, + "short_answer_loss": NaN, + "step": 322, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.2001, + "grad_norm": 1.6328125, + "learning_rate": 2.44337451270842e-05, + "long_answer_loss": 0.2001, + "loss": 0.1671, + "short_answer_loss": NaN, + "step": 323, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1627, + "grad_norm": 1.828125, + "learning_rate": 2.442912990999442e-05, + "long_answer_loss": 0.1627, + "loss": 0.1815, + "short_answer_loss": NaN, + "step": 324, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1915, + "grad_norm": 1.765625, + "learning_rate": 2.442449640063799e-05, + "long_answer_loss": 0.1915, + "loss": 0.1767, + "short_answer_loss": NaN, + "step": 325, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1976, + "grad_norm": 1.6875, + "learning_rate": 2.4419844606119982e-05, + "long_answer_loss": 0.1976, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 326, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1881, + "grad_norm": 1.5546875, + "learning_rate": 2.4415174533573516e-05, + "long_answer_loss": 0.1881, + "loss": 0.1681, + "short_answer_loss": NaN, + "step": 327, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1629, + "grad_norm": 1.7265625, + "learning_rate": 2.4410486190159738e-05, + "long_answer_loss": 0.1629, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 328, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1612, + "grad_norm": 1.578125, + "learning_rate": 2.4405779583067803e-05, + "long_answer_loss": 0.1612, + "loss": 0.1657, + "short_answer_loss": NaN, + "step": 329, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1883, + "grad_norm": 1.59375, + "learning_rate": 2.440105471951488e-05, + "long_answer_loss": 0.1883, + "loss": 0.1729, + "short_answer_loss": NaN, + "step": 330, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1574, + "grad_norm": 1.6875, + "learning_rate": 2.439631160674613e-05, + "long_answer_loss": 0.1574, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 331, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1602, + "grad_norm": 1.6796875, + "learning_rate": 2.4391550252034696e-05, + "long_answer_loss": 0.1602, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 332, + "template_loss": 0.0 + }, + { + "epoch": 0.25, + "full_loss": 0.1489, + "grad_norm": 1.6796875, + "learning_rate": 2.4386770662681698e-05, + "long_answer_loss": 0.1489, + "loss": 0.1778, + "short_answer_loss": NaN, + "step": 333, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1727, + "grad_norm": 1.71875, + "learning_rate": 2.4381972846016204e-05, + "long_answer_loss": 0.1727, + "loss": 0.1757, + "short_answer_loss": NaN, + "step": 334, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1515, + "grad_norm": 1.65625, + "learning_rate": 2.4377156809395256e-05, + "long_answer_loss": 0.1515, + "loss": 0.159, + "short_answer_loss": NaN, + "step": 335, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.157, + "grad_norm": 1.625, + "learning_rate": 2.4372322560203814e-05, + "long_answer_loss": 0.157, + "loss": 0.1645, + "short_answer_loss": NaN, + "step": 336, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1804, + "grad_norm": 1.5703125, + "learning_rate": 2.4367470105854766e-05, + "long_answer_loss": 0.1804, + "loss": 0.1709, + "short_answer_loss": NaN, + "step": 337, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1665, + "grad_norm": 1.59375, + "learning_rate": 2.436259945378893e-05, + "long_answer_loss": 0.1665, + "loss": 0.179, + "short_answer_loss": NaN, + "step": 338, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1861, + "grad_norm": 1.6875, + "learning_rate": 2.4357710611475022e-05, + "long_answer_loss": 0.1861, + "loss": 0.1735, + "short_answer_loss": NaN, + "step": 339, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1816, + "grad_norm": 1.671875, + "learning_rate": 2.4352803586409644e-05, + "long_answer_loss": 0.1816, + "loss": 0.1744, + "short_answer_loss": NaN, + "step": 340, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1678, + "grad_norm": 1.53125, + "learning_rate": 2.4347878386117287e-05, + "long_answer_loss": 0.1678, + "loss": 0.1712, + "short_answer_loss": NaN, + "step": 341, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1804, + "grad_norm": 1.65625, + "learning_rate": 2.434293501815031e-05, + "long_answer_loss": 0.1804, + "loss": 0.1749, + "short_answer_loss": NaN, + "step": 342, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1898, + "grad_norm": 1.5703125, + "learning_rate": 2.4337973490088932e-05, + "long_answer_loss": 0.1898, + "loss": 0.1681, + "short_answer_loss": NaN, + "step": 343, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1709, + "grad_norm": 1.609375, + "learning_rate": 2.4332993809541222e-05, + "long_answer_loss": 0.1709, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 344, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1666, + "grad_norm": 1.6953125, + "learning_rate": 2.432799598414307e-05, + "long_answer_loss": 0.1666, + "loss": 0.1689, + "short_answer_loss": NaN, + "step": 345, + "template_loss": 0.0 + }, + { + "epoch": 0.26, + "full_loss": 0.1463, + "grad_norm": 1.5546875, + "learning_rate": 2.4322980021558208e-05, + "long_answer_loss": 0.1463, + "loss": 0.1599, + "short_answer_loss": NaN, + "step": 346, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1684, + "grad_norm": 1.6796875, + "learning_rate": 2.4317945929478167e-05, + "long_answer_loss": 0.1684, + "loss": 0.1701, + "short_answer_loss": NaN, + "step": 347, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.2068, + "grad_norm": 1.75, + "learning_rate": 2.4312893715622287e-05, + "long_answer_loss": 0.2068, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 348, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1742, + "grad_norm": 1.6015625, + "learning_rate": 2.4307823387737688e-05, + "long_answer_loss": 0.1742, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 349, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1693, + "grad_norm": 1.734375, + "learning_rate": 2.4302734953599267e-05, + "long_answer_loss": 0.1693, + "loss": 0.1699, + "short_answer_loss": NaN, + "step": 350, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1693, + "grad_norm": 1.625, + "learning_rate": 2.4297628421009696e-05, + "long_answer_loss": 0.1693, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 351, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.2014, + "grad_norm": 1.765625, + "learning_rate": 2.4292503797799387e-05, + "long_answer_loss": 0.2014, + "loss": 0.1809, + "short_answer_loss": NaN, + "step": 352, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1607, + "grad_norm": 1.5234375, + "learning_rate": 2.4287361091826493e-05, + "long_answer_loss": 0.1607, + "loss": 0.1639, + "short_answer_loss": NaN, + "step": 353, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1781, + "grad_norm": 1.59375, + "learning_rate": 2.4282200310976908e-05, + "long_answer_loss": 0.1781, + "loss": 0.1739, + "short_answer_loss": NaN, + "step": 354, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1875, + "grad_norm": 1.5625, + "learning_rate": 2.4277021463164225e-05, + "long_answer_loss": 0.1875, + "loss": 0.1692, + "short_answer_loss": NaN, + "step": 355, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1818, + "grad_norm": 1.6796875, + "learning_rate": 2.427182455632976e-05, + "long_answer_loss": 0.1818, + "loss": 0.1769, + "short_answer_loss": NaN, + "step": 356, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1429, + "grad_norm": 1.8125, + "learning_rate": 2.4266609598442496e-05, + "long_answer_loss": 0.1429, + "loss": 0.1608, + "short_answer_loss": NaN, + "step": 357, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1654, + "grad_norm": 1.515625, + "learning_rate": 2.426137659749912e-05, + "long_answer_loss": 0.1654, + "loss": 0.1686, + "short_answer_loss": NaN, + "step": 358, + "template_loss": 0.0 + }, + { + "epoch": 0.27, + "full_loss": 0.1614, + "grad_norm": 1.7578125, + "learning_rate": 2.4256125561523973e-05, + "long_answer_loss": 0.1614, + "loss": 0.1804, + "short_answer_loss": NaN, + "step": 359, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1662, + "grad_norm": 1.5390625, + "learning_rate": 2.425085649856906e-05, + "long_answer_loss": 0.1662, + "loss": 0.1695, + "short_answer_loss": NaN, + "step": 360, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1752, + "grad_norm": 1.640625, + "learning_rate": 2.424556941671402e-05, + "long_answer_loss": 0.1752, + "loss": 0.1774, + "short_answer_loss": NaN, + "step": 361, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1553, + "grad_norm": 1.8046875, + "learning_rate": 2.424026432406612e-05, + "long_answer_loss": 0.1553, + "loss": 0.1713, + "short_answer_loss": NaN, + "step": 362, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1801, + "grad_norm": 1.625, + "learning_rate": 2.423494122876026e-05, + "long_answer_loss": 0.1801, + "loss": 0.1714, + "short_answer_loss": NaN, + "step": 363, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1475, + "grad_norm": 1.6015625, + "learning_rate": 2.422960013895893e-05, + "long_answer_loss": 0.1475, + "loss": 0.1713, + "short_answer_loss": NaN, + "step": 364, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1626, + "grad_norm": 1.578125, + "learning_rate": 2.4224241062852223e-05, + "long_answer_loss": 0.1626, + "loss": 0.1728, + "short_answer_loss": NaN, + "step": 365, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1864, + "grad_norm": 1.5859375, + "learning_rate": 2.421886400865781e-05, + "long_answer_loss": 0.1864, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 366, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1619, + "grad_norm": 1.6796875, + "learning_rate": 2.421346898462092e-05, + "long_answer_loss": 0.1619, + "loss": 0.1722, + "short_answer_loss": NaN, + "step": 367, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1571, + "grad_norm": 1.6328125, + "learning_rate": 2.4208055999014358e-05, + "long_answer_loss": 0.1571, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 368, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1906, + "grad_norm": 1.53125, + "learning_rate": 2.4202625060138448e-05, + "long_answer_loss": 0.1906, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 369, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1708, + "grad_norm": 1.4921875, + "learning_rate": 2.4197176176321062e-05, + "long_answer_loss": 0.1708, + "loss": 0.1736, + "short_answer_loss": NaN, + "step": 370, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1703, + "grad_norm": 1.5078125, + "learning_rate": 2.4191709355917578e-05, + "long_answer_loss": 0.1703, + "loss": 0.1592, + "short_answer_loss": NaN, + "step": 371, + "template_loss": 0.0 + }, + { + "epoch": 0.28, + "full_loss": 0.1814, + "grad_norm": 1.546875, + "learning_rate": 2.4186224607310885e-05, + "long_answer_loss": 0.1814, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 372, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1752, + "grad_norm": 1.578125, + "learning_rate": 2.4180721938911354e-05, + "long_answer_loss": 0.1752, + "loss": 0.175, + "short_answer_loss": NaN, + "step": 373, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1876, + "grad_norm": 1.5, + "learning_rate": 2.417520135915685e-05, + "long_answer_loss": 0.1876, + "loss": 0.1647, + "short_answer_loss": NaN, + "step": 374, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1672, + "grad_norm": 1.625, + "learning_rate": 2.416966287651269e-05, + "long_answer_loss": 0.1672, + "loss": 0.1706, + "short_answer_loss": NaN, + "step": 375, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1832, + "grad_norm": 1.65625, + "learning_rate": 2.4164106499471647e-05, + "long_answer_loss": 0.1832, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 376, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1775, + "grad_norm": 1.609375, + "learning_rate": 2.4158532236553934e-05, + "long_answer_loss": 0.1775, + "loss": 0.167, + "short_answer_loss": NaN, + "step": 377, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1594, + "grad_norm": 1.4921875, + "learning_rate": 2.4152940096307192e-05, + "long_answer_loss": 0.1594, + "loss": 0.1584, + "short_answer_loss": NaN, + "step": 378, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.178, + "grad_norm": 1.5078125, + "learning_rate": 2.4147330087306475e-05, + "long_answer_loss": 0.178, + "loss": 0.1672, + "short_answer_loss": NaN, + "step": 379, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1556, + "grad_norm": 1.5390625, + "learning_rate": 2.4141702218154232e-05, + "long_answer_loss": 0.1556, + "loss": 0.1693, + "short_answer_loss": NaN, + "step": 380, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1987, + "grad_norm": 1.5, + "learning_rate": 2.4136056497480306e-05, + "long_answer_loss": 0.1987, + "loss": 0.1739, + "short_answer_loss": NaN, + "step": 381, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1702, + "grad_norm": 1.65625, + "learning_rate": 2.413039293394191e-05, + "long_answer_loss": 0.1702, + "loss": 0.1722, + "short_answer_loss": NaN, + "step": 382, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1701, + "grad_norm": 1.5546875, + "learning_rate": 2.4124711536223623e-05, + "long_answer_loss": 0.1701, + "loss": 0.1808, + "short_answer_loss": NaN, + "step": 383, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1951, + "grad_norm": 1.5546875, + "learning_rate": 2.4119012313037353e-05, + "long_answer_loss": 0.1951, + "loss": 0.1744, + "short_answer_loss": NaN, + "step": 384, + "template_loss": 0.0 + }, + { + "epoch": 0.29, + "full_loss": 0.1922, + "grad_norm": 1.59375, + "learning_rate": 2.411329527312237e-05, + "long_answer_loss": 0.1922, + "loss": 0.1743, + "short_answer_loss": NaN, + "step": 385, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1705, + "grad_norm": 1.546875, + "learning_rate": 2.4107560425245248e-05, + "long_answer_loss": 0.1705, + "loss": 0.1721, + "short_answer_loss": NaN, + "step": 386, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1606, + "grad_norm": 1.5078125, + "learning_rate": 2.4101807778199858e-05, + "long_answer_loss": 0.1606, + "loss": 0.1718, + "short_answer_loss": NaN, + "step": 387, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1857, + "grad_norm": 1.6484375, + "learning_rate": 2.4096037340807385e-05, + "long_answer_loss": 0.1857, + "loss": 0.1783, + "short_answer_loss": NaN, + "step": 388, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1812, + "grad_norm": 1.5546875, + "learning_rate": 2.4090249121916284e-05, + "long_answer_loss": 0.1812, + "loss": 0.1796, + "short_answer_loss": NaN, + "step": 389, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1745, + "grad_norm": 1.6484375, + "learning_rate": 2.4084443130402274e-05, + "long_answer_loss": 0.1745, + "loss": 0.1636, + "short_answer_loss": NaN, + "step": 390, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1987, + "grad_norm": 1.640625, + "learning_rate": 2.4078619375168333e-05, + "long_answer_loss": 0.1987, + "loss": 0.186, + "short_answer_loss": NaN, + "step": 391, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1688, + "grad_norm": 1.6171875, + "learning_rate": 2.4072777865144678e-05, + "long_answer_loss": 0.1688, + "loss": 0.171, + "short_answer_loss": NaN, + "step": 392, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1596, + "grad_norm": 1.5859375, + "learning_rate": 2.406691860928874e-05, + "long_answer_loss": 0.1596, + "loss": 0.1638, + "short_answer_loss": NaN, + "step": 393, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1692, + "grad_norm": 1.5859375, + "learning_rate": 2.4061041616585177e-05, + "long_answer_loss": 0.1692, + "loss": 0.1742, + "short_answer_loss": NaN, + "step": 394, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1661, + "grad_norm": 1.6015625, + "learning_rate": 2.4055146896045837e-05, + "long_answer_loss": 0.1661, + "loss": 0.1697, + "short_answer_loss": NaN, + "step": 395, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1562, + "grad_norm": 1.515625, + "learning_rate": 2.404923445670975e-05, + "long_answer_loss": 0.1562, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 396, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1623, + "grad_norm": 1.46875, + "learning_rate": 2.404330430764312e-05, + "long_answer_loss": 0.1623, + "loss": 0.1648, + "short_answer_loss": NaN, + "step": 397, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1756, + "grad_norm": 1.5078125, + "learning_rate": 2.4037356457939307e-05, + "long_answer_loss": 0.1756, + "loss": 0.1713, + "short_answer_loss": NaN, + "step": 398, + "template_loss": 0.0 + }, + { + "epoch": 0.3, + "full_loss": 0.1489, + "grad_norm": 1.578125, + "learning_rate": 2.403139091671882e-05, + "long_answer_loss": 0.1489, + "loss": 0.171, + "short_answer_loss": NaN, + "step": 399, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1594, + "grad_norm": 1.4765625, + "learning_rate": 2.4025407693129278e-05, + "long_answer_loss": 0.1594, + "loss": 0.1655, + "short_answer_loss": NaN, + "step": 400, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1474, + "grad_norm": 1.5, + "learning_rate": 2.4019406796345434e-05, + "long_answer_loss": 0.1474, + "loss": 0.1661, + "short_answer_loss": NaN, + "step": 401, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1628, + "grad_norm": 1.609375, + "learning_rate": 2.401338823556913e-05, + "long_answer_loss": 0.1628, + "loss": 0.1789, + "short_answer_loss": NaN, + "step": 402, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1855, + "grad_norm": 1.546875, + "learning_rate": 2.4007352020029292e-05, + "long_answer_loss": 0.1855, + "loss": 0.1672, + "short_answer_loss": NaN, + "step": 403, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1614, + "grad_norm": 1.484375, + "learning_rate": 2.400129815898193e-05, + "long_answer_loss": 0.1614, + "loss": 0.1633, + "short_answer_loss": NaN, + "step": 404, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1755, + "grad_norm": 1.6484375, + "learning_rate": 2.3995226661710105e-05, + "long_answer_loss": 0.1755, + "loss": 0.171, + "short_answer_loss": NaN, + "step": 405, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1746, + "grad_norm": 1.5859375, + "learning_rate": 2.3989137537523922e-05, + "long_answer_loss": 0.1746, + "loss": 0.1729, + "short_answer_loss": NaN, + "step": 406, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1726, + "grad_norm": 1.4375, + "learning_rate": 2.3983030795760504e-05, + "long_answer_loss": 0.1726, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 407, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.153, + "grad_norm": 1.5625, + "learning_rate": 2.3976906445784015e-05, + "long_answer_loss": 0.153, + "loss": 0.1792, + "short_answer_loss": NaN, + "step": 408, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1705, + "grad_norm": 1.5859375, + "learning_rate": 2.3970764496985597e-05, + "long_answer_loss": 0.1705, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 409, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1677, + "grad_norm": 1.484375, + "learning_rate": 2.3964604958783388e-05, + "long_answer_loss": 0.1677, + "loss": 0.1656, + "short_answer_loss": NaN, + "step": 410, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1659, + "grad_norm": 1.5, + "learning_rate": 2.3958427840622495e-05, + "long_answer_loss": 0.1659, + "loss": 0.1671, + "short_answer_loss": NaN, + "step": 411, + "template_loss": 0.0 + }, + { + "epoch": 0.31, + "full_loss": 0.1975, + "grad_norm": 1.5390625, + "learning_rate": 2.3952233151974978e-05, + "long_answer_loss": 0.1975, + "loss": 0.1751, + "short_answer_loss": NaN, + "step": 412, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1731, + "grad_norm": 1.5703125, + "learning_rate": 2.394602090233985e-05, + "long_answer_loss": 0.1731, + "loss": 0.1785, + "short_answer_loss": NaN, + "step": 413, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.154, + "grad_norm": 1.578125, + "learning_rate": 2.393979110124305e-05, + "long_answer_loss": 0.154, + "loss": 0.1602, + "short_answer_loss": NaN, + "step": 414, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1741, + "grad_norm": 1.59375, + "learning_rate": 2.3933543758237418e-05, + "long_answer_loss": 0.1741, + "loss": 0.176, + "short_answer_loss": NaN, + "step": 415, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1838, + "grad_norm": 1.46875, + "learning_rate": 2.392727888290271e-05, + "long_answer_loss": 0.1838, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 416, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1626, + "grad_norm": 1.546875, + "learning_rate": 2.3920996484845558e-05, + "long_answer_loss": 0.1626, + "loss": 0.1733, + "short_answer_loss": NaN, + "step": 417, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1627, + "grad_norm": 1.59375, + "learning_rate": 2.391469657369946e-05, + "long_answer_loss": 0.1627, + "loss": 0.1747, + "short_answer_loss": NaN, + "step": 418, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1681, + "grad_norm": 1.515625, + "learning_rate": 2.3908379159124777e-05, + "long_answer_loss": 0.1681, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 419, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1527, + "grad_norm": 1.5546875, + "learning_rate": 2.3902044250808705e-05, + "long_answer_loss": 0.1527, + "loss": 0.1772, + "short_answer_loss": NaN, + "step": 420, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1756, + "grad_norm": 1.5703125, + "learning_rate": 2.3895691858465267e-05, + "long_answer_loss": 0.1756, + "loss": 0.1732, + "short_answer_loss": NaN, + "step": 421, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1576, + "grad_norm": 1.4765625, + "learning_rate": 2.3889321991835296e-05, + "long_answer_loss": 0.1576, + "loss": 0.1627, + "short_answer_loss": NaN, + "step": 422, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1607, + "grad_norm": 1.6953125, + "learning_rate": 2.3882934660686418e-05, + "long_answer_loss": 0.1607, + "loss": 0.1717, + "short_answer_loss": NaN, + "step": 423, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1931, + "grad_norm": 1.5078125, + "learning_rate": 2.3876529874813036e-05, + "long_answer_loss": 0.1931, + "loss": 0.182, + "short_answer_loss": NaN, + "step": 424, + "template_loss": 0.0 + }, + { + "epoch": 0.32, + "full_loss": 0.1765, + "grad_norm": 1.6484375, + "learning_rate": 2.3870107644036334e-05, + "long_answer_loss": 0.1765, + "loss": 0.1709, + "short_answer_loss": NaN, + "step": 425, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.2063, + "grad_norm": 1.53125, + "learning_rate": 2.3863667978204225e-05, + "long_answer_loss": 0.2063, + "loss": 0.1742, + "short_answer_loss": NaN, + "step": 426, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1628, + "grad_norm": 1.484375, + "learning_rate": 2.385721088719138e-05, + "long_answer_loss": 0.1628, + "loss": 0.1699, + "short_answer_loss": NaN, + "step": 427, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1639, + "grad_norm": 1.5703125, + "learning_rate": 2.385073638089916e-05, + "long_answer_loss": 0.1639, + "loss": 0.1723, + "short_answer_loss": NaN, + "step": 428, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1981, + "grad_norm": 1.375, + "learning_rate": 2.3844244469255665e-05, + "long_answer_loss": 0.1981, + "loss": 0.1602, + "short_answer_loss": NaN, + "step": 429, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1775, + "grad_norm": 1.59375, + "learning_rate": 2.383773516221566e-05, + "long_answer_loss": 0.1775, + "loss": 0.1674, + "short_answer_loss": NaN, + "step": 430, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1972, + "grad_norm": 1.5625, + "learning_rate": 2.3831208469760588e-05, + "long_answer_loss": 0.1972, + "loss": 0.1734, + "short_answer_loss": NaN, + "step": 431, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1634, + "grad_norm": 1.5625, + "learning_rate": 2.3824664401898564e-05, + "long_answer_loss": 0.1634, + "loss": 0.1686, + "short_answer_loss": NaN, + "step": 432, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1852, + "grad_norm": 1.65625, + "learning_rate": 2.3818102968664334e-05, + "long_answer_loss": 0.1852, + "loss": 0.1734, + "short_answer_loss": NaN, + "step": 433, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.164, + "grad_norm": 1.5703125, + "learning_rate": 2.3811524180119276e-05, + "long_answer_loss": 0.164, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 434, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1565, + "grad_norm": 1.5, + "learning_rate": 2.3804928046351384e-05, + "long_answer_loss": 0.1565, + "loss": 0.1663, + "short_answer_loss": NaN, + "step": 435, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1741, + "grad_norm": 1.5859375, + "learning_rate": 2.379831457747524e-05, + "long_answer_loss": 0.1741, + "loss": 0.1718, + "short_answer_loss": NaN, + "step": 436, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1936, + "grad_norm": 1.640625, + "learning_rate": 2.3791683783632018e-05, + "long_answer_loss": 0.1936, + "loss": 0.1797, + "short_answer_loss": NaN, + "step": 437, + "template_loss": 0.0 + }, + { + "epoch": 0.33, + "full_loss": 0.1689, + "grad_norm": 1.515625, + "learning_rate": 2.3785035674989452e-05, + "long_answer_loss": 0.1689, + "loss": 0.1723, + "short_answer_loss": NaN, + "step": 438, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1903, + "grad_norm": 1.671875, + "learning_rate": 2.3778370261741834e-05, + "long_answer_loss": 0.1903, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 439, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1722, + "grad_norm": 1.515625, + "learning_rate": 2.3771687554109983e-05, + "long_answer_loss": 0.1722, + "loss": 0.1616, + "short_answer_loss": NaN, + "step": 440, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1647, + "grad_norm": 1.4609375, + "learning_rate": 2.376498756234124e-05, + "long_answer_loss": 0.1647, + "loss": 0.157, + "short_answer_loss": NaN, + "step": 441, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1953, + "grad_norm": 1.671875, + "learning_rate": 2.3758270296709455e-05, + "long_answer_loss": 0.1953, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 442, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1512, + "grad_norm": 1.609375, + "learning_rate": 2.3751535767514955e-05, + "long_answer_loss": 0.1512, + "loss": 0.1677, + "short_answer_loss": NaN, + "step": 443, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1621, + "grad_norm": 1.6796875, + "learning_rate": 2.374478398508455e-05, + "long_answer_loss": 0.1621, + "loss": 0.1739, + "short_answer_loss": NaN, + "step": 444, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.144, + "grad_norm": 1.5078125, + "learning_rate": 2.3738014959771498e-05, + "long_answer_loss": 0.144, + "loss": 0.1536, + "short_answer_loss": NaN, + "step": 445, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.17, + "grad_norm": 1.4296875, + "learning_rate": 2.3731228701955506e-05, + "long_answer_loss": 0.17, + "loss": 0.1623, + "short_answer_loss": NaN, + "step": 446, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1483, + "grad_norm": 1.4921875, + "learning_rate": 2.3724425222042692e-05, + "long_answer_loss": 0.1483, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 447, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1925, + "grad_norm": 1.53125, + "learning_rate": 2.3717604530465604e-05, + "long_answer_loss": 0.1925, + "loss": 0.1725, + "short_answer_loss": NaN, + "step": 448, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1583, + "grad_norm": 1.5078125, + "learning_rate": 2.3710766637683158e-05, + "long_answer_loss": 0.1583, + "loss": 0.1693, + "short_answer_loss": NaN, + "step": 449, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1951, + "grad_norm": 1.4375, + "learning_rate": 2.3703911554180666e-05, + "long_answer_loss": 0.1951, + "loss": 0.1755, + "short_answer_loss": NaN, + "step": 450, + "template_loss": 0.0 + }, + { + "epoch": 0.34, + "full_loss": 0.1633, + "grad_norm": 1.390625, + "learning_rate": 2.369703929046979e-05, + "long_answer_loss": 0.1633, + "loss": 0.1632, + "short_answer_loss": NaN, + "step": 451, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1814, + "grad_norm": 1.4921875, + "learning_rate": 2.369014985708854e-05, + "long_answer_loss": 0.1814, + "loss": 0.1706, + "short_answer_loss": NaN, + "step": 452, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.178, + "grad_norm": 1.53125, + "learning_rate": 2.3683243264601253e-05, + "long_answer_loss": 0.178, + "loss": 0.1762, + "short_answer_loss": NaN, + "step": 453, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1773, + "grad_norm": 1.578125, + "learning_rate": 2.3676319523598577e-05, + "long_answer_loss": 0.1773, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 454, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1677, + "grad_norm": 1.4453125, + "learning_rate": 2.366937864469746e-05, + "long_answer_loss": 0.1677, + "loss": 0.1714, + "short_answer_loss": NaN, + "step": 455, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1799, + "grad_norm": 1.40625, + "learning_rate": 2.366242063854112e-05, + "long_answer_loss": 0.1799, + "loss": 0.1653, + "short_answer_loss": NaN, + "step": 456, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1721, + "grad_norm": 1.515625, + "learning_rate": 2.3655445515799053e-05, + "long_answer_loss": 0.1721, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 457, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1854, + "grad_norm": 1.4921875, + "learning_rate": 2.364845328716699e-05, + "long_answer_loss": 0.1854, + "loss": 0.1654, + "short_answer_loss": NaN, + "step": 458, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1768, + "grad_norm": 1.546875, + "learning_rate": 2.3641443963366893e-05, + "long_answer_loss": 0.1768, + "loss": 0.1682, + "short_answer_loss": NaN, + "step": 459, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1397, + "grad_norm": 1.6171875, + "learning_rate": 2.3634417555146944e-05, + "long_answer_loss": 0.1397, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 460, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1699, + "grad_norm": 1.4921875, + "learning_rate": 2.3627374073281522e-05, + "long_answer_loss": 0.1699, + "loss": 0.1669, + "short_answer_loss": NaN, + "step": 461, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1854, + "grad_norm": 1.453125, + "learning_rate": 2.3620313528571175e-05, + "long_answer_loss": 0.1854, + "loss": 0.1676, + "short_answer_loss": NaN, + "step": 462, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1752, + "grad_norm": 1.546875, + "learning_rate": 2.361323593184263e-05, + "long_answer_loss": 0.1752, + "loss": 0.1767, + "short_answer_loss": NaN, + "step": 463, + "template_loss": 0.0 + }, + { + "epoch": 0.35, + "full_loss": 0.1727, + "grad_norm": 1.421875, + "learning_rate": 2.360614129394876e-05, + "long_answer_loss": 0.1727, + "loss": 0.1593, + "short_answer_loss": NaN, + "step": 464, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1874, + "grad_norm": 1.515625, + "learning_rate": 2.359902962576856e-05, + "long_answer_loss": 0.1874, + "loss": 0.1697, + "short_answer_loss": NaN, + "step": 465, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.161, + "grad_norm": 1.4296875, + "learning_rate": 2.3591900938207147e-05, + "long_answer_loss": 0.161, + "loss": 0.1741, + "short_answer_loss": NaN, + "step": 466, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1667, + "grad_norm": 1.53125, + "learning_rate": 2.358475524219573e-05, + "long_answer_loss": 0.1667, + "loss": 0.1749, + "short_answer_loss": NaN, + "step": 467, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1708, + "grad_norm": 1.4921875, + "learning_rate": 2.3577592548691606e-05, + "long_answer_loss": 0.1708, + "loss": 0.1739, + "short_answer_loss": NaN, + "step": 468, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1475, + "grad_norm": 1.6015625, + "learning_rate": 2.3570412868678132e-05, + "long_answer_loss": 0.1475, + "loss": 0.1611, + "short_answer_loss": NaN, + "step": 469, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.2036, + "grad_norm": 1.515625, + "learning_rate": 2.3563216213164713e-05, + "long_answer_loss": 0.2036, + "loss": 0.1762, + "short_answer_loss": NaN, + "step": 470, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1634, + "grad_norm": 1.421875, + "learning_rate": 2.3556002593186783e-05, + "long_answer_loss": 0.1634, + "loss": 0.1632, + "short_answer_loss": NaN, + "step": 471, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1565, + "grad_norm": 1.5859375, + "learning_rate": 2.3548772019805793e-05, + "long_answer_loss": 0.1565, + "loss": 0.1654, + "short_answer_loss": NaN, + "step": 472, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1365, + "grad_norm": 1.515625, + "learning_rate": 2.3541524504109182e-05, + "long_answer_loss": 0.1365, + "loss": 0.1658, + "short_answer_loss": NaN, + "step": 473, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1582, + "grad_norm": 1.4453125, + "learning_rate": 2.3534260057210384e-05, + "long_answer_loss": 0.1582, + "loss": 0.1704, + "short_answer_loss": NaN, + "step": 474, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.2011, + "grad_norm": 1.4375, + "learning_rate": 2.3526978690248782e-05, + "long_answer_loss": 0.2011, + "loss": 0.1601, + "short_answer_loss": NaN, + "step": 475, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1821, + "grad_norm": 1.5078125, + "learning_rate": 2.351968041438971e-05, + "long_answer_loss": 0.1821, + "loss": 0.1651, + "short_answer_loss": NaN, + "step": 476, + "template_loss": 0.0 + }, + { + "epoch": 0.36, + "full_loss": 0.1619, + "grad_norm": 1.4453125, + "learning_rate": 2.3512365240824426e-05, + "long_answer_loss": 0.1619, + "loss": 0.1641, + "short_answer_loss": NaN, + "step": 477, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1637, + "grad_norm": 1.4296875, + "learning_rate": 2.350503318077011e-05, + "long_answer_loss": 0.1637, + "loss": 0.1589, + "short_answer_loss": NaN, + "step": 478, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.2126, + "grad_norm": 1.734375, + "learning_rate": 2.3497684245469816e-05, + "long_answer_loss": 0.2126, + "loss": 0.1755, + "short_answer_loss": NaN, + "step": 479, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1556, + "grad_norm": 1.421875, + "learning_rate": 2.3490318446192498e-05, + "long_answer_loss": 0.1556, + "loss": 0.1589, + "short_answer_loss": NaN, + "step": 480, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1729, + "grad_norm": 1.5, + "learning_rate": 2.3482935794232953e-05, + "long_answer_loss": 0.1729, + "loss": 0.1698, + "short_answer_loss": NaN, + "step": 481, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1569, + "grad_norm": 1.515625, + "learning_rate": 2.3475536300911827e-05, + "long_answer_loss": 0.1569, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 482, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1344, + "grad_norm": 1.4765625, + "learning_rate": 2.346811997757559e-05, + "long_answer_loss": 0.1344, + "loss": 0.168, + "short_answer_loss": NaN, + "step": 483, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1579, + "grad_norm": 1.4921875, + "learning_rate": 2.3460686835596514e-05, + "long_answer_loss": 0.1579, + "loss": 0.1681, + "short_answer_loss": NaN, + "step": 484, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1907, + "grad_norm": 1.515625, + "learning_rate": 2.345323688637267e-05, + "long_answer_loss": 0.1907, + "loss": 0.1705, + "short_answer_loss": NaN, + "step": 485, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1715, + "grad_norm": 1.546875, + "learning_rate": 2.34457701413279e-05, + "long_answer_loss": 0.1715, + "loss": 0.1613, + "short_answer_loss": NaN, + "step": 486, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1533, + "grad_norm": 1.3984375, + "learning_rate": 2.3438286611911787e-05, + "long_answer_loss": 0.1533, + "loss": 0.1624, + "short_answer_loss": NaN, + "step": 487, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1697, + "grad_norm": 1.5703125, + "learning_rate": 2.3430786309599674e-05, + "long_answer_loss": 0.1697, + "loss": 0.1659, + "short_answer_loss": NaN, + "step": 488, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1823, + "grad_norm": 1.6328125, + "learning_rate": 2.3423269245892602e-05, + "long_answer_loss": 0.1823, + "loss": 0.1716, + "short_answer_loss": NaN, + "step": 489, + "template_loss": 0.0 + }, + { + "epoch": 0.37, + "full_loss": 0.1861, + "grad_norm": 1.5625, + "learning_rate": 2.3415735432317328e-05, + "long_answer_loss": 0.1861, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 490, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1726, + "grad_norm": 1.3984375, + "learning_rate": 2.3408184880426293e-05, + "long_answer_loss": 0.1726, + "loss": 0.1685, + "short_answer_loss": NaN, + "step": 491, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1605, + "grad_norm": 1.46875, + "learning_rate": 2.3400617601797597e-05, + "long_answer_loss": 0.1605, + "loss": 0.1666, + "short_answer_loss": NaN, + "step": 492, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1579, + "grad_norm": 1.609375, + "learning_rate": 2.3393033608034993e-05, + "long_answer_loss": 0.1579, + "loss": 0.164, + "short_answer_loss": NaN, + "step": 493, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1729, + "grad_norm": 1.4765625, + "learning_rate": 2.338543291076787e-05, + "long_answer_loss": 0.1729, + "loss": 0.1662, + "short_answer_loss": NaN, + "step": 494, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1626, + "grad_norm": 1.515625, + "learning_rate": 2.3377815521651213e-05, + "long_answer_loss": 0.1626, + "loss": 0.1745, + "short_answer_loss": NaN, + "step": 495, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1797, + "grad_norm": 1.6015625, + "learning_rate": 2.3370181452365633e-05, + "long_answer_loss": 0.1797, + "loss": 0.175, + "short_answer_loss": NaN, + "step": 496, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1468, + "grad_norm": 1.453125, + "learning_rate": 2.3362530714617287e-05, + "long_answer_loss": 0.1468, + "loss": 0.1721, + "short_answer_loss": NaN, + "step": 497, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1568, + "grad_norm": 1.4765625, + "learning_rate": 2.3354863320137916e-05, + "long_answer_loss": 0.1568, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 498, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1422, + "grad_norm": 1.5390625, + "learning_rate": 2.3347179280684782e-05, + "long_answer_loss": 0.1422, + "loss": 0.1718, + "short_answer_loss": NaN, + "step": 499, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1559, + "grad_norm": 1.546875, + "learning_rate": 2.3339478608040682e-05, + "long_answer_loss": 0.1559, + "loss": 0.1763, + "short_answer_loss": NaN, + "step": 500, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1621, + "grad_norm": 1.484375, + "learning_rate": 2.3331761314013924e-05, + "long_answer_loss": 0.1621, + "loss": 0.165, + "short_answer_loss": NaN, + "step": 501, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1939, + "grad_norm": 1.5390625, + "learning_rate": 2.3324027410438288e-05, + "long_answer_loss": 0.1939, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 502, + "template_loss": 0.0 + }, + { + "epoch": 0.38, + "full_loss": 0.1553, + "grad_norm": 1.40625, + "learning_rate": 2.331627690917304e-05, + "long_answer_loss": 0.1553, + "loss": 0.1684, + "short_answer_loss": NaN, + "step": 503, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1612, + "grad_norm": 1.421875, + "learning_rate": 2.3308509822102884e-05, + "long_answer_loss": 0.1612, + "loss": 0.1664, + "short_answer_loss": NaN, + "step": 504, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1765, + "grad_norm": 1.578125, + "learning_rate": 2.330072616113796e-05, + "long_answer_loss": 0.1765, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 505, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1581, + "grad_norm": 1.515625, + "learning_rate": 2.329292593821383e-05, + "long_answer_loss": 0.1581, + "loss": 0.1593, + "short_answer_loss": NaN, + "step": 506, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1539, + "grad_norm": 1.4296875, + "learning_rate": 2.3285109165291442e-05, + "long_answer_loss": 0.1539, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 507, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1727, + "grad_norm": 1.515625, + "learning_rate": 2.327727585435713e-05, + "long_answer_loss": 0.1727, + "loss": 0.1705, + "short_answer_loss": NaN, + "step": 508, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1661, + "grad_norm": 1.5546875, + "learning_rate": 2.3269426017422576e-05, + "long_answer_loss": 0.1661, + "loss": 0.1679, + "short_answer_loss": NaN, + "step": 509, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1664, + "grad_norm": 1.3984375, + "learning_rate": 2.3261559666524824e-05, + "long_answer_loss": 0.1664, + "loss": 0.16, + "short_answer_loss": NaN, + "step": 510, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1819, + "grad_norm": 1.59375, + "learning_rate": 2.3253676813726218e-05, + "long_answer_loss": 0.1819, + "loss": 0.1691, + "short_answer_loss": NaN, + "step": 511, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1431, + "grad_norm": 1.4453125, + "learning_rate": 2.324577747111442e-05, + "long_answer_loss": 0.1431, + "loss": 0.1634, + "short_answer_loss": NaN, + "step": 512, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.146, + "grad_norm": 1.484375, + "learning_rate": 2.323786165080238e-05, + "long_answer_loss": 0.146, + "loss": 0.1679, + "short_answer_loss": NaN, + "step": 513, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.172, + "grad_norm": 1.6171875, + "learning_rate": 2.3229929364928294e-05, + "long_answer_loss": 0.172, + "loss": 0.173, + "short_answer_loss": NaN, + "step": 514, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1789, + "grad_norm": 1.484375, + "learning_rate": 2.3221980625655632e-05, + "long_answer_loss": 0.1789, + "loss": 0.1587, + "short_answer_loss": NaN, + "step": 515, + "template_loss": 0.0 + }, + { + "epoch": 0.39, + "full_loss": 0.1505, + "grad_norm": 1.46875, + "learning_rate": 2.3214015445173083e-05, + "long_answer_loss": 0.1505, + "loss": 0.1606, + "short_answer_loss": NaN, + "step": 516, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1465, + "grad_norm": 1.4609375, + "learning_rate": 2.3206033835694545e-05, + "long_answer_loss": 0.1465, + "loss": 0.163, + "short_answer_loss": NaN, + "step": 517, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1578, + "grad_norm": 1.5390625, + "learning_rate": 2.3198035809459114e-05, + "long_answer_loss": 0.1578, + "loss": 0.1711, + "short_answer_loss": NaN, + "step": 518, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1532, + "grad_norm": 1.453125, + "learning_rate": 2.3190021378731054e-05, + "long_answer_loss": 0.1532, + "loss": 0.156, + "short_answer_loss": NaN, + "step": 519, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1936, + "grad_norm": 1.5078125, + "learning_rate": 2.3181990555799786e-05, + "long_answer_loss": 0.1936, + "loss": 0.1668, + "short_answer_loss": NaN, + "step": 520, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1764, + "grad_norm": 1.78125, + "learning_rate": 2.3173943352979865e-05, + "long_answer_loss": 0.1764, + "loss": 0.1717, + "short_answer_loss": NaN, + "step": 521, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1808, + "grad_norm": 1.5859375, + "learning_rate": 2.3165879782610973e-05, + "long_answer_loss": 0.1808, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 522, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1686, + "grad_norm": 1.546875, + "learning_rate": 2.3157799857057878e-05, + "long_answer_loss": 0.1686, + "loss": 0.1627, + "short_answer_loss": NaN, + "step": 523, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1803, + "grad_norm": 1.5234375, + "learning_rate": 2.314970358871043e-05, + "long_answer_loss": 0.1803, + "loss": 0.1723, + "short_answer_loss": NaN, + "step": 524, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1508, + "grad_norm": 1.46875, + "learning_rate": 2.314159098998354e-05, + "long_answer_loss": 0.1508, + "loss": 0.1642, + "short_answer_loss": NaN, + "step": 525, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1954, + "grad_norm": 1.625, + "learning_rate": 2.3133462073317174e-05, + "long_answer_loss": 0.1954, + "loss": 0.1791, + "short_answer_loss": NaN, + "step": 526, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1637, + "grad_norm": 1.4921875, + "learning_rate": 2.3125316851176288e-05, + "long_answer_loss": 0.1637, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 527, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1809, + "grad_norm": 1.5859375, + "learning_rate": 2.3117155336050875e-05, + "long_answer_loss": 0.1809, + "loss": 0.1741, + "short_answer_loss": NaN, + "step": 528, + "template_loss": 0.0 + }, + { + "epoch": 0.4, + "full_loss": 0.1594, + "grad_norm": 1.5, + "learning_rate": 2.3108977540455893e-05, + "long_answer_loss": 0.1594, + "loss": 0.1669, + "short_answer_loss": NaN, + "step": 529, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.204, + "grad_norm": 1.4375, + "learning_rate": 2.3100783476931267e-05, + "long_answer_loss": 0.204, + "loss": 0.1645, + "short_answer_loss": NaN, + "step": 530, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1508, + "grad_norm": 1.421875, + "learning_rate": 2.3092573158041873e-05, + "long_answer_loss": 0.1508, + "loss": 0.1678, + "short_answer_loss": NaN, + "step": 531, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1789, + "grad_norm": 1.484375, + "learning_rate": 2.3084346596377505e-05, + "long_answer_loss": 0.1789, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 532, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1675, + "grad_norm": 1.5546875, + "learning_rate": 2.3076103804552872e-05, + "long_answer_loss": 0.1675, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 533, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1722, + "grad_norm": 1.53125, + "learning_rate": 2.3067844795207565e-05, + "long_answer_loss": 0.1722, + "loss": 0.1775, + "short_answer_loss": NaN, + "step": 534, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1715, + "grad_norm": 1.6484375, + "learning_rate": 2.305956958100605e-05, + "long_answer_loss": 0.1715, + "loss": 0.1688, + "short_answer_loss": NaN, + "step": 535, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.168, + "grad_norm": 1.6015625, + "learning_rate": 2.305127817463763e-05, + "long_answer_loss": 0.168, + "loss": 0.1673, + "short_answer_loss": NaN, + "step": 536, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1516, + "grad_norm": 1.5703125, + "learning_rate": 2.3042970588816445e-05, + "long_answer_loss": 0.1516, + "loss": 0.1723, + "short_answer_loss": NaN, + "step": 537, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1507, + "grad_norm": 1.453125, + "learning_rate": 2.3034646836281447e-05, + "long_answer_loss": 0.1507, + "loss": 0.1544, + "short_answer_loss": NaN, + "step": 538, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1509, + "grad_norm": 1.609375, + "learning_rate": 2.3026306929796374e-05, + "long_answer_loss": 0.1509, + "loss": 0.1665, + "short_answer_loss": NaN, + "step": 539, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1362, + "grad_norm": 1.5703125, + "learning_rate": 2.3017950882149736e-05, + "long_answer_loss": 0.1362, + "loss": 0.1747, + "short_answer_loss": NaN, + "step": 540, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1778, + "grad_norm": 1.5859375, + "learning_rate": 2.3009578706154787e-05, + "long_answer_loss": 0.1778, + "loss": 0.1722, + "short_answer_loss": NaN, + "step": 541, + "template_loss": 0.0 + }, + { + "epoch": 0.41, + "full_loss": 0.1641, + "grad_norm": 1.5546875, + "learning_rate": 2.300119041464953e-05, + "long_answer_loss": 0.1641, + "loss": 0.1689, + "short_answer_loss": NaN, + "step": 542, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1651, + "grad_norm": 1.5234375, + "learning_rate": 2.2992786020496665e-05, + "long_answer_loss": 0.1651, + "loss": 0.159, + "short_answer_loss": NaN, + "step": 543, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1431, + "grad_norm": 1.5234375, + "learning_rate": 2.2984365536583585e-05, + "long_answer_loss": 0.1431, + "loss": 0.1627, + "short_answer_loss": NaN, + "step": 544, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1724, + "grad_norm": 1.4453125, + "learning_rate": 2.2975928975822363e-05, + "long_answer_loss": 0.1724, + "loss": 0.1637, + "short_answer_loss": NaN, + "step": 545, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1353, + "grad_norm": 1.3984375, + "learning_rate": 2.2967476351149713e-05, + "long_answer_loss": 0.1353, + "loss": 0.1562, + "short_answer_loss": NaN, + "step": 546, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1846, + "grad_norm": 1.453125, + "learning_rate": 2.2959007675526987e-05, + "long_answer_loss": 0.1846, + "loss": 0.1698, + "short_answer_loss": NaN, + "step": 547, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1545, + "grad_norm": 1.484375, + "learning_rate": 2.2950522961940163e-05, + "long_answer_loss": 0.1545, + "loss": 0.166, + "short_answer_loss": NaN, + "step": 548, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1724, + "grad_norm": 1.46875, + "learning_rate": 2.2942022223399788e-05, + "long_answer_loss": 0.1724, + "loss": 0.1688, + "short_answer_loss": NaN, + "step": 549, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1635, + "grad_norm": 1.46875, + "learning_rate": 2.2933505472940995e-05, + "long_answer_loss": 0.1635, + "loss": 0.158, + "short_answer_loss": NaN, + "step": 550, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1854, + "grad_norm": 1.6875, + "learning_rate": 2.2924972723623474e-05, + "long_answer_loss": 0.1854, + "loss": 0.1712, + "short_answer_loss": NaN, + "step": 551, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1738, + "grad_norm": 1.5546875, + "learning_rate": 2.2916423988531437e-05, + "long_answer_loss": 0.1738, + "loss": 0.1655, + "short_answer_loss": NaN, + "step": 552, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1479, + "grad_norm": 1.4453125, + "learning_rate": 2.2907859280773617e-05, + "long_answer_loss": 0.1479, + "loss": 0.1625, + "short_answer_loss": NaN, + "step": 553, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.161, + "grad_norm": 1.4140625, + "learning_rate": 2.2899278613483232e-05, + "long_answer_loss": 0.161, + "loss": 0.1561, + "short_answer_loss": NaN, + "step": 554, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1328, + "grad_norm": 1.5078125, + "learning_rate": 2.289068199981798e-05, + "long_answer_loss": 0.1328, + "loss": 0.1561, + "short_answer_loss": NaN, + "step": 555, + "template_loss": 0.0 + }, + { + "epoch": 0.42, + "full_loss": 0.1908, + "grad_norm": 1.4140625, + "learning_rate": 2.288206945296001e-05, + "long_answer_loss": 0.1908, + "loss": 0.1616, + "short_answer_loss": NaN, + "step": 556, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1642, + "grad_norm": 1.53125, + "learning_rate": 2.2873440986115903e-05, + "long_answer_loss": 0.1642, + "loss": 0.1605, + "short_answer_loss": NaN, + "step": 557, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.181, + "grad_norm": 1.546875, + "learning_rate": 2.2864796612516644e-05, + "long_answer_loss": 0.181, + "loss": 0.1652, + "short_answer_loss": NaN, + "step": 558, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1668, + "grad_norm": 1.46875, + "learning_rate": 2.2856136345417618e-05, + "long_answer_loss": 0.1668, + "loss": 0.1721, + "short_answer_loss": NaN, + "step": 559, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1369, + "grad_norm": 1.515625, + "learning_rate": 2.2847460198098585e-05, + "long_answer_loss": 0.1369, + "loss": 0.1557, + "short_answer_loss": NaN, + "step": 560, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1605, + "grad_norm": 1.46875, + "learning_rate": 2.2838768183863644e-05, + "long_answer_loss": 0.1605, + "loss": 0.1621, + "short_answer_loss": NaN, + "step": 561, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.15, + "grad_norm": 1.4375, + "learning_rate": 2.283006031604123e-05, + "long_answer_loss": 0.15, + "loss": 0.1575, + "short_answer_loss": NaN, + "step": 562, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.177, + "grad_norm": 1.6328125, + "learning_rate": 2.2821336607984095e-05, + "long_answer_loss": 0.177, + "loss": 0.1659, + "short_answer_loss": NaN, + "step": 563, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1767, + "grad_norm": 1.5234375, + "learning_rate": 2.2812597073069274e-05, + "long_answer_loss": 0.1767, + "loss": 0.1588, + "short_answer_loss": NaN, + "step": 564, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1729, + "grad_norm": 1.546875, + "learning_rate": 2.2803841724698065e-05, + "long_answer_loss": 0.1729, + "loss": 0.1653, + "short_answer_loss": NaN, + "step": 565, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1453, + "grad_norm": 1.390625, + "learning_rate": 2.279507057629603e-05, + "long_answer_loss": 0.1453, + "loss": 0.165, + "short_answer_loss": NaN, + "step": 566, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1633, + "grad_norm": 1.5078125, + "learning_rate": 2.278628364131294e-05, + "long_answer_loss": 0.1633, + "loss": 0.1712, + "short_answer_loss": NaN, + "step": 567, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1726, + "grad_norm": 1.5390625, + "learning_rate": 2.277748093322279e-05, + "long_answer_loss": 0.1726, + "loss": 0.1663, + "short_answer_loss": NaN, + "step": 568, + "template_loss": 0.0 + }, + { + "epoch": 0.43, + "full_loss": 0.1551, + "grad_norm": 1.4375, + "learning_rate": 2.2768662465523755e-05, + "long_answer_loss": 0.1551, + "loss": 0.1603, + "short_answer_loss": NaN, + "step": 569, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1701, + "grad_norm": 1.390625, + "learning_rate": 2.275982825173817e-05, + "long_answer_loss": 0.1701, + "loss": 0.1638, + "short_answer_loss": NaN, + "step": 570, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1825, + "grad_norm": 1.515625, + "learning_rate": 2.2750978305412528e-05, + "long_answer_loss": 0.1825, + "loss": 0.1686, + "short_answer_loss": NaN, + "step": 571, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1553, + "grad_norm": 1.5390625, + "learning_rate": 2.274211264011744e-05, + "long_answer_loss": 0.1553, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 572, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.181, + "grad_norm": 1.609375, + "learning_rate": 2.273323126944762e-05, + "long_answer_loss": 0.181, + "loss": 0.1664, + "short_answer_loss": NaN, + "step": 573, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1669, + "grad_norm": 1.59375, + "learning_rate": 2.2724334207021857e-05, + "long_answer_loss": 0.1669, + "loss": 0.1657, + "short_answer_loss": NaN, + "step": 574, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1474, + "grad_norm": 1.5390625, + "learning_rate": 2.271542146648302e-05, + "long_answer_loss": 0.1474, + "loss": 0.1571, + "short_answer_loss": NaN, + "step": 575, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.168, + "grad_norm": 1.484375, + "learning_rate": 2.2706493061498e-05, + "long_answer_loss": 0.168, + "loss": 0.1579, + "short_answer_loss": NaN, + "step": 576, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1424, + "grad_norm": 1.59375, + "learning_rate": 2.2697549005757728e-05, + "long_answer_loss": 0.1424, + "loss": 0.16, + "short_answer_loss": NaN, + "step": 577, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1731, + "grad_norm": 1.46875, + "learning_rate": 2.2688589312977117e-05, + "long_answer_loss": 0.1731, + "loss": 0.1612, + "short_answer_loss": NaN, + "step": 578, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1888, + "grad_norm": 1.484375, + "learning_rate": 2.267961399689506e-05, + "long_answer_loss": 0.1888, + "loss": 0.1614, + "short_answer_loss": NaN, + "step": 579, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1405, + "grad_norm": 1.6875, + "learning_rate": 2.2670623071274423e-05, + "long_answer_loss": 0.1405, + "loss": 0.1643, + "short_answer_loss": NaN, + "step": 580, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1466, + "grad_norm": 1.4609375, + "learning_rate": 2.2661616549901982e-05, + "long_answer_loss": 0.1466, + "loss": 0.1608, + "short_answer_loss": NaN, + "step": 581, + "template_loss": 0.0 + }, + { + "epoch": 0.44, + "full_loss": 0.1548, + "grad_norm": 1.3984375, + "learning_rate": 2.2652594446588456e-05, + "long_answer_loss": 0.1548, + "loss": 0.1601, + "short_answer_loss": NaN, + "step": 582, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1799, + "grad_norm": 1.5546875, + "learning_rate": 2.264355677516843e-05, + "long_answer_loss": 0.1799, + "loss": 0.1651, + "short_answer_loss": NaN, + "step": 583, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1587, + "grad_norm": 1.453125, + "learning_rate": 2.263450354950038e-05, + "long_answer_loss": 0.1587, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 584, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1888, + "grad_norm": 1.53125, + "learning_rate": 2.262543478346663e-05, + "long_answer_loss": 0.1888, + "loss": 0.17, + "short_answer_loss": NaN, + "step": 585, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1487, + "grad_norm": 1.546875, + "learning_rate": 2.2616350490973326e-05, + "long_answer_loss": 0.1487, + "loss": 0.1602, + "short_answer_loss": NaN, + "step": 586, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1764, + "grad_norm": 1.5, + "learning_rate": 2.2607250685950435e-05, + "long_answer_loss": 0.1764, + "loss": 0.1652, + "short_answer_loss": NaN, + "step": 587, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1787, + "grad_norm": 1.515625, + "learning_rate": 2.2598135382351698e-05, + "long_answer_loss": 0.1787, + "loss": 0.1636, + "short_answer_loss": NaN, + "step": 588, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1398, + "grad_norm": 1.625, + "learning_rate": 2.2589004594154633e-05, + "long_answer_loss": 0.1398, + "loss": 0.1544, + "short_answer_loss": NaN, + "step": 589, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1665, + "grad_norm": 1.453125, + "learning_rate": 2.2579858335360492e-05, + "long_answer_loss": 0.1665, + "loss": 0.168, + "short_answer_loss": NaN, + "step": 590, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1713, + "grad_norm": 1.5, + "learning_rate": 2.2570696619994253e-05, + "long_answer_loss": 0.1713, + "loss": 0.1672, + "short_answer_loss": NaN, + "step": 591, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1436, + "grad_norm": 1.421875, + "learning_rate": 2.2561519462104604e-05, + "long_answer_loss": 0.1436, + "loss": 0.16, + "short_answer_loss": NaN, + "step": 592, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1525, + "grad_norm": 1.53125, + "learning_rate": 2.25523268757639e-05, + "long_answer_loss": 0.1525, + "loss": 0.1584, + "short_answer_loss": NaN, + "step": 593, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1666, + "grad_norm": 1.40625, + "learning_rate": 2.2543118875068166e-05, + "long_answer_loss": 0.1666, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 594, + "template_loss": 0.0 + }, + { + "epoch": 0.45, + "full_loss": 0.1843, + "grad_norm": 1.359375, + "learning_rate": 2.2533895474137047e-05, + "long_answer_loss": 0.1843, + "loss": 0.1619, + "short_answer_loss": NaN, + "step": 595, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1571, + "grad_norm": 1.359375, + "learning_rate": 2.2524656687113822e-05, + "long_answer_loss": 0.1571, + "loss": 0.1593, + "short_answer_loss": NaN, + "step": 596, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1842, + "grad_norm": 1.4921875, + "learning_rate": 2.251540252816535e-05, + "long_answer_loss": 0.1842, + "loss": 0.1646, + "short_answer_loss": NaN, + "step": 597, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1378, + "grad_norm": 1.46875, + "learning_rate": 2.2506133011482075e-05, + "long_answer_loss": 0.1378, + "loss": 0.1564, + "short_answer_loss": NaN, + "step": 598, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1795, + "grad_norm": 1.609375, + "learning_rate": 2.2496848151277973e-05, + "long_answer_loss": 0.1795, + "loss": 0.1634, + "short_answer_loss": NaN, + "step": 599, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.182, + "grad_norm": 1.5703125, + "learning_rate": 2.2487547961790556e-05, + "long_answer_loss": 0.182, + "loss": 0.1685, + "short_answer_loss": NaN, + "step": 600, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1688, + "grad_norm": 1.5, + "learning_rate": 2.2478232457280845e-05, + "long_answer_loss": 0.1688, + "loss": 0.1604, + "short_answer_loss": NaN, + "step": 601, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.135, + "grad_norm": 1.3984375, + "learning_rate": 2.2468901652033346e-05, + "long_answer_loss": 0.135, + "loss": 0.153, + "short_answer_loss": NaN, + "step": 602, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1504, + "grad_norm": 1.46875, + "learning_rate": 2.2459555560356023e-05, + "long_answer_loss": 0.1504, + "loss": 0.1553, + "short_answer_loss": NaN, + "step": 603, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1645, + "grad_norm": 1.6171875, + "learning_rate": 2.2450194196580278e-05, + "long_answer_loss": 0.1645, + "loss": 0.1645, + "short_answer_loss": NaN, + "step": 604, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1687, + "grad_norm": 1.421875, + "learning_rate": 2.244081757506094e-05, + "long_answer_loss": 0.1687, + "loss": 0.157, + "short_answer_loss": NaN, + "step": 605, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1474, + "grad_norm": 1.421875, + "learning_rate": 2.2431425710176226e-05, + "long_answer_loss": 0.1474, + "loss": 0.1563, + "short_answer_loss": NaN, + "step": 606, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1731, + "grad_norm": 1.4453125, + "learning_rate": 2.2422018616327734e-05, + "long_answer_loss": 0.1731, + "loss": 0.1666, + "short_answer_loss": NaN, + "step": 607, + "template_loss": 0.0 + }, + { + "epoch": 0.46, + "full_loss": 0.1654, + "grad_norm": 1.53125, + "learning_rate": 2.241259630794041e-05, + "long_answer_loss": 0.1654, + "loss": 0.1572, + "short_answer_loss": NaN, + "step": 608, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1783, + "grad_norm": 1.6015625, + "learning_rate": 2.2403158799462524e-05, + "long_answer_loss": 0.1783, + "loss": 0.1667, + "short_answer_loss": NaN, + "step": 609, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1855, + "grad_norm": 1.4765625, + "learning_rate": 2.239370610536568e-05, + "long_answer_loss": 0.1855, + "loss": 0.1664, + "short_answer_loss": NaN, + "step": 610, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.204, + "grad_norm": 1.4609375, + "learning_rate": 2.238423824014473e-05, + "long_answer_loss": 0.204, + "loss": 0.1666, + "short_answer_loss": NaN, + "step": 611, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1617, + "grad_norm": 1.484375, + "learning_rate": 2.2374755218317817e-05, + "long_answer_loss": 0.1617, + "loss": 0.1639, + "short_answer_loss": NaN, + "step": 612, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1845, + "grad_norm": 1.53125, + "learning_rate": 2.2365257054426315e-05, + "long_answer_loss": 0.1845, + "loss": 0.165, + "short_answer_loss": NaN, + "step": 613, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1409, + "grad_norm": 1.3984375, + "learning_rate": 2.2355743763034825e-05, + "long_answer_loss": 0.1409, + "loss": 0.1524, + "short_answer_loss": NaN, + "step": 614, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1681, + "grad_norm": 1.4453125, + "learning_rate": 2.234621535873113e-05, + "long_answer_loss": 0.1681, + "loss": 0.1571, + "short_answer_loss": NaN, + "step": 615, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1709, + "grad_norm": 1.390625, + "learning_rate": 2.23366718561262e-05, + "long_answer_loss": 0.1709, + "loss": 0.1564, + "short_answer_loss": NaN, + "step": 616, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1757, + "grad_norm": 1.515625, + "learning_rate": 2.2327113269854154e-05, + "long_answer_loss": 0.1757, + "loss": 0.1657, + "short_answer_loss": NaN, + "step": 617, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1609, + "grad_norm": 1.5390625, + "learning_rate": 2.231753961457224e-05, + "long_answer_loss": 0.1609, + "loss": 0.1656, + "short_answer_loss": NaN, + "step": 618, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1462, + "grad_norm": 1.3671875, + "learning_rate": 2.2307950904960813e-05, + "long_answer_loss": 0.1462, + "loss": 0.1584, + "short_answer_loss": NaN, + "step": 619, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1626, + "grad_norm": 1.453125, + "learning_rate": 2.2298347155723302e-05, + "long_answer_loss": 0.1626, + "loss": 0.1523, + "short_answer_loss": NaN, + "step": 620, + "template_loss": 0.0 + }, + { + "epoch": 0.47, + "full_loss": 0.1791, + "grad_norm": 1.5, + "learning_rate": 2.2288728381586224e-05, + "long_answer_loss": 0.1791, + "loss": 0.1635, + "short_answer_loss": NaN, + "step": 621, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.146, + "grad_norm": 1.453125, + "learning_rate": 2.2279094597299108e-05, + "long_answer_loss": 0.146, + "loss": 0.1553, + "short_answer_loss": NaN, + "step": 622, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1659, + "grad_norm": 1.375, + "learning_rate": 2.2269445817634514e-05, + "long_answer_loss": 0.1659, + "loss": 0.1549, + "short_answer_loss": NaN, + "step": 623, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1679, + "grad_norm": 1.4296875, + "learning_rate": 2.2259782057387994e-05, + "long_answer_loss": 0.1679, + "loss": 0.1658, + "short_answer_loss": NaN, + "step": 624, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.188, + "grad_norm": 1.4375, + "learning_rate": 2.2250103331378067e-05, + "long_answer_loss": 0.188, + "loss": 0.158, + "short_answer_loss": NaN, + "step": 625, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1569, + "grad_norm": 1.3984375, + "learning_rate": 2.224040965444621e-05, + "long_answer_loss": 0.1569, + "loss": 0.158, + "short_answer_loss": NaN, + "step": 626, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1536, + "grad_norm": 1.421875, + "learning_rate": 2.2230701041456814e-05, + "long_answer_loss": 0.1536, + "loss": 0.1516, + "short_answer_loss": NaN, + "step": 627, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1335, + "grad_norm": 1.4140625, + "learning_rate": 2.222097750729718e-05, + "long_answer_loss": 0.1335, + "loss": 0.1654, + "short_answer_loss": NaN, + "step": 628, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.17, + "grad_norm": 1.4453125, + "learning_rate": 2.221123906687749e-05, + "long_answer_loss": 0.17, + "loss": 0.1615, + "short_answer_loss": NaN, + "step": 629, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.182, + "grad_norm": 1.3671875, + "learning_rate": 2.2201485735130787e-05, + "long_answer_loss": 0.182, + "loss": 0.164, + "short_answer_loss": NaN, + "step": 630, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1433, + "grad_norm": 1.453125, + "learning_rate": 2.2191717527012935e-05, + "long_answer_loss": 0.1433, + "loss": 0.1515, + "short_answer_loss": NaN, + "step": 631, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1535, + "grad_norm": 1.4140625, + "learning_rate": 2.2181934457502622e-05, + "long_answer_loss": 0.1535, + "loss": 0.155, + "short_answer_loss": NaN, + "step": 632, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1635, + "grad_norm": 1.453125, + "learning_rate": 2.2172136541601322e-05, + "long_answer_loss": 0.1635, + "loss": 0.1578, + "short_answer_loss": NaN, + "step": 633, + "template_loss": 0.0 + }, + { + "epoch": 0.48, + "full_loss": 0.1698, + "grad_norm": 1.46875, + "learning_rate": 2.216232379433327e-05, + "long_answer_loss": 0.1698, + "loss": 0.1715, + "short_answer_loss": NaN, + "step": 634, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1945, + "grad_norm": 1.3828125, + "learning_rate": 2.2152496230745447e-05, + "long_answer_loss": 0.1945, + "loss": 0.1623, + "short_answer_loss": NaN, + "step": 635, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1695, + "grad_norm": 1.46875, + "learning_rate": 2.2142653865907557e-05, + "long_answer_loss": 0.1695, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 636, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.163, + "grad_norm": 1.4375, + "learning_rate": 2.2132796714911998e-05, + "long_answer_loss": 0.163, + "loss": 0.1615, + "short_answer_loss": NaN, + "step": 637, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1437, + "grad_norm": 1.4375, + "learning_rate": 2.2122924792873827e-05, + "long_answer_loss": 0.1437, + "loss": 0.1662, + "short_answer_loss": NaN, + "step": 638, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1582, + "grad_norm": 1.4921875, + "learning_rate": 2.211303811493078e-05, + "long_answer_loss": 0.1582, + "loss": 0.1603, + "short_answer_loss": NaN, + "step": 639, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1522, + "grad_norm": 1.4765625, + "learning_rate": 2.2103136696243197e-05, + "long_answer_loss": 0.1522, + "loss": 0.1593, + "short_answer_loss": NaN, + "step": 640, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1626, + "grad_norm": 1.375, + "learning_rate": 2.2093220551994033e-05, + "long_answer_loss": 0.1626, + "loss": 0.1483, + "short_answer_loss": NaN, + "step": 641, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1684, + "grad_norm": 1.4296875, + "learning_rate": 2.2083289697388808e-05, + "long_answer_loss": 0.1684, + "loss": 0.1642, + "short_answer_loss": NaN, + "step": 642, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1512, + "grad_norm": 1.5078125, + "learning_rate": 2.207334414765562e-05, + "long_answer_loss": 0.1512, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 643, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1601, + "grad_norm": 1.40625, + "learning_rate": 2.2063383918045092e-05, + "long_answer_loss": 0.1601, + "loss": 0.1625, + "short_answer_loss": NaN, + "step": 644, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1868, + "grad_norm": 1.46875, + "learning_rate": 2.2053409023830353e-05, + "long_answer_loss": 0.1868, + "loss": 0.1692, + "short_answer_loss": NaN, + "step": 645, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1715, + "grad_norm": 1.4921875, + "learning_rate": 2.204341948030702e-05, + "long_answer_loss": 0.1715, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 646, + "template_loss": 0.0 + }, + { + "epoch": 0.49, + "full_loss": 0.1579, + "grad_norm": 1.453125, + "learning_rate": 2.2033415302793173e-05, + "long_answer_loss": 0.1579, + "loss": 0.1539, + "short_answer_loss": NaN, + "step": 647, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1937, + "grad_norm": 1.4375, + "learning_rate": 2.202339650662934e-05, + "long_answer_loss": 0.1937, + "loss": 0.1533, + "short_answer_loss": NaN, + "step": 648, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1718, + "grad_norm": 1.546875, + "learning_rate": 2.2013363107178454e-05, + "long_answer_loss": 0.1718, + "loss": 0.1625, + "short_answer_loss": NaN, + "step": 649, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1678, + "grad_norm": 1.46875, + "learning_rate": 2.2003315119825856e-05, + "long_answer_loss": 0.1678, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 650, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1645, + "grad_norm": 1.5625, + "learning_rate": 2.199325255997923e-05, + "long_answer_loss": 0.1645, + "loss": 0.1689, + "short_answer_loss": NaN, + "step": 651, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1483, + "grad_norm": 1.421875, + "learning_rate": 2.1983175443068645e-05, + "long_answer_loss": 0.1483, + "loss": 0.1637, + "short_answer_loss": NaN, + "step": 652, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1693, + "grad_norm": 1.5546875, + "learning_rate": 2.1973083784546454e-05, + "long_answer_loss": 0.1693, + "loss": 0.1675, + "short_answer_loss": NaN, + "step": 653, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1406, + "grad_norm": 1.3359375, + "learning_rate": 2.1962977599887324e-05, + "long_answer_loss": 0.1406, + "loss": 0.1513, + "short_answer_loss": NaN, + "step": 654, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1466, + "grad_norm": 1.4765625, + "learning_rate": 2.19528569045882e-05, + "long_answer_loss": 0.1466, + "loss": 0.1656, + "short_answer_loss": NaN, + "step": 655, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1671, + "grad_norm": 1.484375, + "learning_rate": 2.1942721714168274e-05, + "long_answer_loss": 0.1671, + "loss": 0.1652, + "short_answer_loss": NaN, + "step": 656, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1676, + "grad_norm": 1.46875, + "learning_rate": 2.1932572044168964e-05, + "long_answer_loss": 0.1676, + "loss": 0.1648, + "short_answer_loss": NaN, + "step": 657, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1671, + "grad_norm": 1.4609375, + "learning_rate": 2.1922407910153895e-05, + "long_answer_loss": 0.1671, + "loss": 0.1618, + "short_answer_loss": NaN, + "step": 658, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1679, + "grad_norm": 1.46875, + "learning_rate": 2.191222932770886e-05, + "long_answer_loss": 0.1679, + "loss": 0.1688, + "short_answer_loss": NaN, + "step": 659, + "template_loss": 0.0 + }, + { + "epoch": 0.5, + "full_loss": 0.1853, + "grad_norm": 1.484375, + "learning_rate": 2.1902036312441824e-05, + "long_answer_loss": 0.1853, + "loss": 0.1664, + "short_answer_loss": NaN, + "step": 660, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1532, + "grad_norm": 1.4140625, + "learning_rate": 2.1891828879982877e-05, + "long_answer_loss": 0.1532, + "loss": 0.1532, + "short_answer_loss": NaN, + "step": 661, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1579, + "grad_norm": 1.5, + "learning_rate": 2.1881607045984202e-05, + "long_answer_loss": 0.1579, + "loss": 0.1651, + "short_answer_loss": NaN, + "step": 662, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1708, + "grad_norm": 1.421875, + "learning_rate": 2.1871370826120093e-05, + "long_answer_loss": 0.1708, + "loss": 0.1526, + "short_answer_loss": NaN, + "step": 663, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1641, + "grad_norm": 1.46875, + "learning_rate": 2.186112023608688e-05, + "long_answer_loss": 0.1641, + "loss": 0.1633, + "short_answer_loss": NaN, + "step": 664, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1567, + "grad_norm": 1.5234375, + "learning_rate": 2.1850855291602942e-05, + "long_answer_loss": 0.1567, + "loss": 0.1582, + "short_answer_loss": NaN, + "step": 665, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1713, + "grad_norm": 1.53125, + "learning_rate": 2.184057600840866e-05, + "long_answer_loss": 0.1713, + "loss": 0.1637, + "short_answer_loss": NaN, + "step": 666, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1682, + "grad_norm": 1.4921875, + "learning_rate": 2.1830282402266407e-05, + "long_answer_loss": 0.1682, + "loss": 0.1545, + "short_answer_loss": NaN, + "step": 667, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1714, + "grad_norm": 1.59375, + "learning_rate": 2.181997448896052e-05, + "long_answer_loss": 0.1714, + "loss": 0.1571, + "short_answer_loss": NaN, + "step": 668, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1608, + "grad_norm": 1.5234375, + "learning_rate": 2.1809652284297275e-05, + "long_answer_loss": 0.1608, + "loss": 0.1614, + "short_answer_loss": NaN, + "step": 669, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1385, + "grad_norm": 1.6015625, + "learning_rate": 2.1799315804104858e-05, + "long_answer_loss": 0.1385, + "loss": 0.1701, + "short_answer_loss": NaN, + "step": 670, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1628, + "grad_norm": 1.53125, + "learning_rate": 2.1788965064233346e-05, + "long_answer_loss": 0.1628, + "loss": 0.1674, + "short_answer_loss": NaN, + "step": 671, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.1639, + "grad_norm": 1.3984375, + "learning_rate": 2.177860008055469e-05, + "long_answer_loss": 0.1639, + "loss": 0.1571, + "short_answer_loss": NaN, + "step": 672, + "template_loss": 0.0 + }, + { + "epoch": 0.51, + "full_loss": 0.143, + "grad_norm": 1.421875, + "learning_rate": 2.1768220868962675e-05, + "long_answer_loss": 0.143, + "loss": 0.1505, + "short_answer_loss": NaN, + "step": 673, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1511, + "grad_norm": 1.3828125, + "learning_rate": 2.1757827445372896e-05, + "long_answer_loss": 0.1511, + "loss": 0.1514, + "short_answer_loss": NaN, + "step": 674, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1556, + "grad_norm": 1.484375, + "learning_rate": 2.174741982572276e-05, + "long_answer_loss": 0.1556, + "loss": 0.1587, + "short_answer_loss": NaN, + "step": 675, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.154, + "grad_norm": 1.4375, + "learning_rate": 2.1736998025971433e-05, + "long_answer_loss": 0.154, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 676, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1721, + "grad_norm": 1.4609375, + "learning_rate": 2.1726562062099816e-05, + "long_answer_loss": 0.1721, + "loss": 0.1676, + "short_answer_loss": NaN, + "step": 677, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1666, + "grad_norm": 1.3828125, + "learning_rate": 2.1716111950110545e-05, + "long_answer_loss": 0.1666, + "loss": 0.172, + "short_answer_loss": NaN, + "step": 678, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1712, + "grad_norm": 1.4453125, + "learning_rate": 2.1705647706027938e-05, + "long_answer_loss": 0.1712, + "loss": 0.1585, + "short_answer_loss": NaN, + "step": 679, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1519, + "grad_norm": 1.53125, + "learning_rate": 2.1695169345897993e-05, + "long_answer_loss": 0.1519, + "loss": 0.1538, + "short_answer_loss": NaN, + "step": 680, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1544, + "grad_norm": 1.6484375, + "learning_rate": 2.168467688578834e-05, + "long_answer_loss": 0.1544, + "loss": 0.1557, + "short_answer_loss": NaN, + "step": 681, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1447, + "grad_norm": 1.3671875, + "learning_rate": 2.167417034178825e-05, + "long_answer_loss": 0.1447, + "loss": 0.1622, + "short_answer_loss": NaN, + "step": 682, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1723, + "grad_norm": 1.4609375, + "learning_rate": 2.166364973000858e-05, + "long_answer_loss": 0.1723, + "loss": 0.169, + "short_answer_loss": NaN, + "step": 683, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1506, + "grad_norm": 1.3671875, + "learning_rate": 2.1653115066581752e-05, + "long_answer_loss": 0.1506, + "loss": 0.1597, + "short_answer_loss": NaN, + "step": 684, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1669, + "grad_norm": 1.4140625, + "learning_rate": 2.1642566367661744e-05, + "long_answer_loss": 0.1669, + "loss": 0.1546, + "short_answer_loss": NaN, + "step": 685, + "template_loss": 0.0 + }, + { + "epoch": 0.52, + "full_loss": 0.1499, + "grad_norm": 1.421875, + "learning_rate": 2.1632003649424054e-05, + "long_answer_loss": 0.1499, + "loss": 0.1492, + "short_answer_loss": NaN, + "step": 686, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1791, + "grad_norm": 1.484375, + "learning_rate": 2.162142692806568e-05, + "long_answer_loss": 0.1791, + "loss": 0.161, + "short_answer_loss": NaN, + "step": 687, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1576, + "grad_norm": 1.4140625, + "learning_rate": 2.1610836219805085e-05, + "long_answer_loss": 0.1576, + "loss": 0.1541, + "short_answer_loss": NaN, + "step": 688, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1531, + "grad_norm": 1.3828125, + "learning_rate": 2.1600231540882184e-05, + "long_answer_loss": 0.1531, + "loss": 0.155, + "short_answer_loss": NaN, + "step": 689, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1639, + "grad_norm": 1.5078125, + "learning_rate": 2.158961290755832e-05, + "long_answer_loss": 0.1639, + "loss": 0.1626, + "short_answer_loss": NaN, + "step": 690, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.166, + "grad_norm": 1.421875, + "learning_rate": 2.1578980336116226e-05, + "long_answer_loss": 0.166, + "loss": 0.1628, + "short_answer_loss": NaN, + "step": 691, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1709, + "grad_norm": 1.4921875, + "learning_rate": 2.1568333842860007e-05, + "long_answer_loss": 0.1709, + "loss": 0.1614, + "short_answer_loss": NaN, + "step": 692, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1704, + "grad_norm": 1.421875, + "learning_rate": 2.1557673444115127e-05, + "long_answer_loss": 0.1704, + "loss": 0.1557, + "short_answer_loss": NaN, + "step": 693, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1628, + "grad_norm": 1.3515625, + "learning_rate": 2.1546999156228366e-05, + "long_answer_loss": 0.1628, + "loss": 0.1547, + "short_answer_loss": NaN, + "step": 694, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1591, + "grad_norm": 1.375, + "learning_rate": 2.1536310995567794e-05, + "long_answer_loss": 0.1591, + "loss": 0.1561, + "short_answer_loss": NaN, + "step": 695, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1692, + "grad_norm": 1.3984375, + "learning_rate": 2.152560897852276e-05, + "long_answer_loss": 0.1692, + "loss": 0.1617, + "short_answer_loss": NaN, + "step": 696, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1663, + "grad_norm": 1.3984375, + "learning_rate": 2.151489312150387e-05, + "long_answer_loss": 0.1663, + "loss": 0.1543, + "short_answer_loss": NaN, + "step": 697, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1695, + "grad_norm": 1.421875, + "learning_rate": 2.150416344094294e-05, + "long_answer_loss": 0.1695, + "loss": 0.1591, + "short_answer_loss": NaN, + "step": 698, + "template_loss": 0.0 + }, + { + "epoch": 0.53, + "full_loss": 0.1523, + "grad_norm": 1.3515625, + "learning_rate": 2.149341995329299e-05, + "long_answer_loss": 0.1523, + "loss": 0.1537, + "short_answer_loss": NaN, + "step": 699, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1699, + "grad_norm": 1.4140625, + "learning_rate": 2.14826626750282e-05, + "long_answer_loss": 0.1699, + "loss": 0.1606, + "short_answer_loss": NaN, + "step": 700, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1369, + "grad_norm": 1.515625, + "learning_rate": 2.147189162264391e-05, + "long_answer_loss": 0.1369, + "loss": 0.1605, + "short_answer_loss": NaN, + "step": 701, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1477, + "grad_norm": 1.3828125, + "learning_rate": 2.1461106812656583e-05, + "long_answer_loss": 0.1477, + "loss": 0.1572, + "short_answer_loss": NaN, + "step": 702, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1557, + "grad_norm": 1.3984375, + "learning_rate": 2.145030826160377e-05, + "long_answer_loss": 0.1557, + "loss": 0.1486, + "short_answer_loss": NaN, + "step": 703, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1728, + "grad_norm": 1.5390625, + "learning_rate": 2.1439495986044088e-05, + "long_answer_loss": 0.1728, + "loss": 0.1681, + "short_answer_loss": NaN, + "step": 704, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1508, + "grad_norm": 1.4375, + "learning_rate": 2.142867000255721e-05, + "long_answer_loss": 0.1508, + "loss": 0.153, + "short_answer_loss": NaN, + "step": 705, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1616, + "grad_norm": 1.4296875, + "learning_rate": 2.141783032774383e-05, + "long_answer_loss": 0.1616, + "loss": 0.1578, + "short_answer_loss": NaN, + "step": 706, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1402, + "grad_norm": 1.4765625, + "learning_rate": 2.1406976978225623e-05, + "long_answer_loss": 0.1402, + "loss": 0.1515, + "short_answer_loss": NaN, + "step": 707, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1611, + "grad_norm": 1.4375, + "learning_rate": 2.139610997064525e-05, + "long_answer_loss": 0.1611, + "loss": 0.1528, + "short_answer_loss": NaN, + "step": 708, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1608, + "grad_norm": 1.4296875, + "learning_rate": 2.1385229321666304e-05, + "long_answer_loss": 0.1608, + "loss": 0.1584, + "short_answer_loss": NaN, + "step": 709, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1673, + "grad_norm": 1.6015625, + "learning_rate": 2.1374335047973292e-05, + "long_answer_loss": 0.1673, + "loss": 0.1642, + "short_answer_loss": NaN, + "step": 710, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1603, + "grad_norm": 1.53125, + "learning_rate": 2.1363427166271632e-05, + "long_answer_loss": 0.1603, + "loss": 0.1639, + "short_answer_loss": NaN, + "step": 711, + "template_loss": 0.0 + }, + { + "epoch": 0.54, + "full_loss": 0.1876, + "grad_norm": 1.40625, + "learning_rate": 2.1352505693287587e-05, + "long_answer_loss": 0.1876, + "loss": 0.1615, + "short_answer_loss": NaN, + "step": 712, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1583, + "grad_norm": 1.4609375, + "learning_rate": 2.1341570645768273e-05, + "long_answer_loss": 0.1583, + "loss": 0.1627, + "short_answer_loss": NaN, + "step": 713, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1504, + "grad_norm": 1.4375, + "learning_rate": 2.1330622040481624e-05, + "long_answer_loss": 0.1504, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 714, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1234, + "grad_norm": 1.375, + "learning_rate": 2.1319659894216355e-05, + "long_answer_loss": 0.1234, + "loss": 0.1463, + "short_answer_loss": NaN, + "step": 715, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1471, + "grad_norm": 1.4453125, + "learning_rate": 2.1308684223781945e-05, + "long_answer_loss": 0.1471, + "loss": 0.1562, + "short_answer_loss": NaN, + "step": 716, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1731, + "grad_norm": 1.5859375, + "learning_rate": 2.129769504600862e-05, + "long_answer_loss": 0.1731, + "loss": 0.1657, + "short_answer_loss": NaN, + "step": 717, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1534, + "grad_norm": 1.390625, + "learning_rate": 2.1286692377747315e-05, + "long_answer_loss": 0.1534, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 718, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1623, + "grad_norm": 1.546875, + "learning_rate": 2.1275676235869644e-05, + "long_answer_loss": 0.1623, + "loss": 0.1592, + "short_answer_loss": NaN, + "step": 719, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1562, + "grad_norm": 1.421875, + "learning_rate": 2.1264646637267886e-05, + "long_answer_loss": 0.1562, + "loss": 0.1566, + "short_answer_loss": NaN, + "step": 720, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1464, + "grad_norm": 1.4921875, + "learning_rate": 2.1253603598854964e-05, + "long_answer_loss": 0.1464, + "loss": 0.1647, + "short_answer_loss": NaN, + "step": 721, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1665, + "grad_norm": 1.3515625, + "learning_rate": 2.12425471375644e-05, + "long_answer_loss": 0.1665, + "loss": 0.154, + "short_answer_loss": NaN, + "step": 722, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1601, + "grad_norm": 1.4453125, + "learning_rate": 2.1231477270350293e-05, + "long_answer_loss": 0.1601, + "loss": 0.1509, + "short_answer_loss": NaN, + "step": 723, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1454, + "grad_norm": 1.453125, + "learning_rate": 2.1220394014187312e-05, + "long_answer_loss": 0.1454, + "loss": 0.1603, + "short_answer_loss": NaN, + "step": 724, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.164, + "grad_norm": 1.421875, + "learning_rate": 2.1209297386070647e-05, + "long_answer_loss": 0.164, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 725, + "template_loss": 0.0 + }, + { + "epoch": 0.55, + "full_loss": 0.1379, + "grad_norm": 1.359375, + "learning_rate": 2.1198187403016e-05, + "long_answer_loss": 0.1379, + "loss": 0.1564, + "short_answer_loss": NaN, + "step": 726, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1616, + "grad_norm": 1.390625, + "learning_rate": 2.118706408205955e-05, + "long_answer_loss": 0.1616, + "loss": 0.1562, + "short_answer_loss": NaN, + "step": 727, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1567, + "grad_norm": 1.453125, + "learning_rate": 2.1175927440257926e-05, + "long_answer_loss": 0.1567, + "loss": 0.1581, + "short_answer_loss": NaN, + "step": 728, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1605, + "grad_norm": 1.5625, + "learning_rate": 2.1164777494688178e-05, + "long_answer_loss": 0.1605, + "loss": 0.1655, + "short_answer_loss": NaN, + "step": 729, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1674, + "grad_norm": 1.453125, + "learning_rate": 2.115361426244777e-05, + "long_answer_loss": 0.1674, + "loss": 0.1546, + "short_answer_loss": NaN, + "step": 730, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.168, + "grad_norm": 1.375, + "learning_rate": 2.114243776065453e-05, + "long_answer_loss": 0.168, + "loss": 0.1618, + "short_answer_loss": NaN, + "step": 731, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1741, + "grad_norm": 1.4921875, + "learning_rate": 2.1131248006446635e-05, + "long_answer_loss": 0.1741, + "loss": 0.1667, + "short_answer_loss": NaN, + "step": 732, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.173, + "grad_norm": 1.46875, + "learning_rate": 2.1120045016982585e-05, + "long_answer_loss": 0.173, + "loss": 0.1653, + "short_answer_loss": NaN, + "step": 733, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1371, + "grad_norm": 1.4140625, + "learning_rate": 2.110882880944117e-05, + "long_answer_loss": 0.1371, + "loss": 0.151, + "short_answer_loss": NaN, + "step": 734, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1731, + "grad_norm": 1.46875, + "learning_rate": 2.109759940102146e-05, + "long_answer_loss": 0.1731, + "loss": 0.1581, + "short_answer_loss": NaN, + "step": 735, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1223, + "grad_norm": 1.3203125, + "learning_rate": 2.1086356808942758e-05, + "long_answer_loss": 0.1223, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 736, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1406, + "grad_norm": 1.4453125, + "learning_rate": 2.1075101050444583e-05, + "long_answer_loss": 0.1406, + "loss": 0.1555, + "short_answer_loss": NaN, + "step": 737, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1444, + "grad_norm": 1.53125, + "learning_rate": 2.1063832142786652e-05, + "long_answer_loss": 0.1444, + "loss": 0.1646, + "short_answer_loss": NaN, + "step": 738, + "template_loss": 0.0 + }, + { + "epoch": 0.56, + "full_loss": 0.1632, + "grad_norm": 1.5625, + "learning_rate": 2.1052550103248836e-05, + "long_answer_loss": 0.1632, + "loss": 0.1588, + "short_answer_loss": NaN, + "step": 739, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.153, + "grad_norm": 1.4609375, + "learning_rate": 2.1041254949131143e-05, + "long_answer_loss": 0.153, + "loss": 0.1566, + "short_answer_loss": NaN, + "step": 740, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1593, + "grad_norm": 1.390625, + "learning_rate": 2.1029946697753693e-05, + "long_answer_loss": 0.1593, + "loss": 0.1514, + "short_answer_loss": NaN, + "step": 741, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1732, + "grad_norm": 1.3984375, + "learning_rate": 2.10186253664567e-05, + "long_answer_loss": 0.1732, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 742, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1384, + "grad_norm": 1.3203125, + "learning_rate": 2.1007290972600415e-05, + "long_answer_loss": 0.1384, + "loss": 0.1427, + "short_answer_loss": NaN, + "step": 743, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1521, + "grad_norm": 1.3828125, + "learning_rate": 2.0995943533565136e-05, + "long_answer_loss": 0.1521, + "loss": 0.1532, + "short_answer_loss": NaN, + "step": 744, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1557, + "grad_norm": 1.5234375, + "learning_rate": 2.0984583066751152e-05, + "long_answer_loss": 0.1557, + "loss": 0.1563, + "short_answer_loss": NaN, + "step": 745, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1462, + "grad_norm": 1.40625, + "learning_rate": 2.0973209589578742e-05, + "long_answer_loss": 0.1462, + "loss": 0.1545, + "short_answer_loss": NaN, + "step": 746, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1669, + "grad_norm": 1.4140625, + "learning_rate": 2.0961823119488115e-05, + "long_answer_loss": 0.1669, + "loss": 0.1576, + "short_answer_loss": NaN, + "step": 747, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1665, + "grad_norm": 1.4453125, + "learning_rate": 2.0950423673939435e-05, + "long_answer_loss": 0.1665, + "loss": 0.1636, + "short_answer_loss": NaN, + "step": 748, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1561, + "grad_norm": 1.46875, + "learning_rate": 2.0939011270412735e-05, + "long_answer_loss": 0.1561, + "loss": 0.151, + "short_answer_loss": NaN, + "step": 749, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.173, + "grad_norm": 1.3671875, + "learning_rate": 2.092758592640793e-05, + "long_answer_loss": 0.173, + "loss": 0.1494, + "short_answer_loss": NaN, + "step": 750, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.1618, + "grad_norm": 1.375, + "learning_rate": 2.0916147659444768e-05, + "long_answer_loss": 0.1618, + "loss": 0.1595, + "short_answer_loss": NaN, + "step": 751, + "template_loss": 0.0 + }, + { + "epoch": 0.57, + "full_loss": 0.139, + "grad_norm": 1.359375, + "learning_rate": 2.090469648706283e-05, + "long_answer_loss": 0.139, + "loss": 0.1557, + "short_answer_loss": NaN, + "step": 752, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1761, + "grad_norm": 1.390625, + "learning_rate": 2.089323242682147e-05, + "long_answer_loss": 0.1761, + "loss": 0.1582, + "short_answer_loss": NaN, + "step": 753, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1327, + "grad_norm": 1.4296875, + "learning_rate": 2.0881755496299817e-05, + "long_answer_loss": 0.1327, + "loss": 0.1499, + "short_answer_loss": NaN, + "step": 754, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.16, + "grad_norm": 1.453125, + "learning_rate": 2.0870265713096726e-05, + "long_answer_loss": 0.16, + "loss": 0.1573, + "short_answer_loss": NaN, + "step": 755, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1353, + "grad_norm": 1.4921875, + "learning_rate": 2.085876309483077e-05, + "long_answer_loss": 0.1353, + "loss": 0.1536, + "short_answer_loss": NaN, + "step": 756, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.129, + "grad_norm": 1.3984375, + "learning_rate": 2.084724765914019e-05, + "long_answer_loss": 0.129, + "loss": 0.151, + "short_answer_loss": NaN, + "step": 757, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1443, + "grad_norm": 1.4765625, + "learning_rate": 2.083571942368289e-05, + "long_answer_loss": 0.1443, + "loss": 0.1539, + "short_answer_loss": NaN, + "step": 758, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1633, + "grad_norm": 1.453125, + "learning_rate": 2.0824178406136407e-05, + "long_answer_loss": 0.1633, + "loss": 0.1595, + "short_answer_loss": NaN, + "step": 759, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1449, + "grad_norm": 1.328125, + "learning_rate": 2.0812624624197868e-05, + "long_answer_loss": 0.1449, + "loss": 0.1463, + "short_answer_loss": NaN, + "step": 760, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1805, + "grad_norm": 1.6015625, + "learning_rate": 2.0801058095583977e-05, + "long_answer_loss": 0.1805, + "loss": 0.1586, + "short_answer_loss": NaN, + "step": 761, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1466, + "grad_norm": 1.4609375, + "learning_rate": 2.078947883803098e-05, + "long_answer_loss": 0.1466, + "loss": 0.1549, + "short_answer_loss": NaN, + "step": 762, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1405, + "grad_norm": 1.484375, + "learning_rate": 2.0777886869294655e-05, + "long_answer_loss": 0.1405, + "loss": 0.1552, + "short_answer_loss": NaN, + "step": 763, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1516, + "grad_norm": 1.5, + "learning_rate": 2.076628220715025e-05, + "long_answer_loss": 0.1516, + "loss": 0.1591, + "short_answer_loss": NaN, + "step": 764, + "template_loss": 0.0 + }, + { + "epoch": 0.58, + "full_loss": 0.1504, + "grad_norm": 1.4453125, + "learning_rate": 2.0754664869392494e-05, + "long_answer_loss": 0.1504, + "loss": 0.1615, + "short_answer_loss": NaN, + "step": 765, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1533, + "grad_norm": 1.4609375, + "learning_rate": 2.0743034873835547e-05, + "long_answer_loss": 0.1533, + "loss": 0.1507, + "short_answer_loss": NaN, + "step": 766, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1544, + "grad_norm": 1.4140625, + "learning_rate": 2.0731392238312985e-05, + "long_answer_loss": 0.1544, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 767, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1276, + "grad_norm": 1.3828125, + "learning_rate": 2.0719736980677754e-05, + "long_answer_loss": 0.1276, + "loss": 0.1467, + "short_answer_loss": NaN, + "step": 768, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1534, + "grad_norm": 1.3515625, + "learning_rate": 2.0708069118802166e-05, + "long_answer_loss": 0.1534, + "loss": 0.1503, + "short_answer_loss": NaN, + "step": 769, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1577, + "grad_norm": 1.4375, + "learning_rate": 2.0696388670577852e-05, + "long_answer_loss": 0.1577, + "loss": 0.1567, + "short_answer_loss": NaN, + "step": 770, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1717, + "grad_norm": 1.40625, + "learning_rate": 2.068469565391575e-05, + "long_answer_loss": 0.1717, + "loss": 0.1565, + "short_answer_loss": NaN, + "step": 771, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1555, + "grad_norm": 1.3515625, + "learning_rate": 2.0672990086746067e-05, + "long_answer_loss": 0.1555, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 772, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1462, + "grad_norm": 1.390625, + "learning_rate": 2.066127198701826e-05, + "long_answer_loss": 0.1462, + "loss": 0.1559, + "short_answer_loss": NaN, + "step": 773, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1359, + "grad_norm": 1.421875, + "learning_rate": 2.0649541372700993e-05, + "long_answer_loss": 0.1359, + "loss": 0.1492, + "short_answer_loss": NaN, + "step": 774, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1595, + "grad_norm": 1.3515625, + "learning_rate": 2.063779826178213e-05, + "long_answer_loss": 0.1595, + "loss": 0.1528, + "short_answer_loss": NaN, + "step": 775, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1492, + "grad_norm": 1.5, + "learning_rate": 2.0626042672268692e-05, + "long_answer_loss": 0.1492, + "loss": 0.1499, + "short_answer_loss": NaN, + "step": 776, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.177, + "grad_norm": 1.4453125, + "learning_rate": 2.061427462218684e-05, + "long_answer_loss": 0.177, + "loss": 0.1546, + "short_answer_loss": NaN, + "step": 777, + "template_loss": 0.0 + }, + { + "epoch": 0.59, + "full_loss": 0.1602, + "grad_norm": 1.4453125, + "learning_rate": 2.060249412958184e-05, + "long_answer_loss": 0.1602, + "loss": 0.1683, + "short_answer_loss": NaN, + "step": 778, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1405, + "grad_norm": 1.34375, + "learning_rate": 2.059070121251803e-05, + "long_answer_loss": 0.1405, + "loss": 0.1478, + "short_answer_loss": NaN, + "step": 779, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1483, + "grad_norm": 1.53125, + "learning_rate": 2.057889588907881e-05, + "long_answer_loss": 0.1483, + "loss": 0.1505, + "short_answer_loss": NaN, + "step": 780, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1368, + "grad_norm": 1.421875, + "learning_rate": 2.05670781773666e-05, + "long_answer_loss": 0.1368, + "loss": 0.16, + "short_answer_loss": NaN, + "step": 781, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1582, + "grad_norm": 1.421875, + "learning_rate": 2.0555248095502823e-05, + "long_answer_loss": 0.1582, + "loss": 0.1583, + "short_answer_loss": NaN, + "step": 782, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1473, + "grad_norm": 1.5, + "learning_rate": 2.054340566162785e-05, + "long_answer_loss": 0.1473, + "loss": 0.1561, + "short_answer_loss": NaN, + "step": 783, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1368, + "grad_norm": 1.515625, + "learning_rate": 2.053155089390102e-05, + "long_answer_loss": 0.1368, + "loss": 0.1545, + "short_answer_loss": NaN, + "step": 784, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1446, + "grad_norm": 1.359375, + "learning_rate": 2.0519683810500568e-05, + "long_answer_loss": 0.1446, + "loss": 0.1473, + "short_answer_loss": NaN, + "step": 785, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1443, + "grad_norm": 1.34375, + "learning_rate": 2.0507804429623613e-05, + "long_answer_loss": 0.1443, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 786, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1292, + "grad_norm": 1.546875, + "learning_rate": 2.0495912769486143e-05, + "long_answer_loss": 0.1292, + "loss": 0.1502, + "short_answer_loss": NaN, + "step": 787, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1566, + "grad_norm": 1.5703125, + "learning_rate": 2.0484008848322962e-05, + "long_answer_loss": 0.1566, + "loss": 0.1563, + "short_answer_loss": NaN, + "step": 788, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1532, + "grad_norm": 1.46875, + "learning_rate": 2.0472092684387688e-05, + "long_answer_loss": 0.1532, + "loss": 0.1493, + "short_answer_loss": NaN, + "step": 789, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1707, + "grad_norm": 1.4375, + "learning_rate": 2.04601642959527e-05, + "long_answer_loss": 0.1707, + "loss": 0.1552, + "short_answer_loss": NaN, + "step": 790, + "template_loss": 0.0 + }, + { + "epoch": 0.6, + "full_loss": 0.1538, + "grad_norm": 1.4140625, + "learning_rate": 2.0448223701309126e-05, + "long_answer_loss": 0.1538, + "loss": 0.1531, + "short_answer_loss": NaN, + "step": 791, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1559, + "grad_norm": 1.4453125, + "learning_rate": 2.043627091876682e-05, + "long_answer_loss": 0.1559, + "loss": 0.1566, + "short_answer_loss": NaN, + "step": 792, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1547, + "grad_norm": 1.375, + "learning_rate": 2.0424305966654312e-05, + "long_answer_loss": 0.1547, + "loss": 0.1542, + "short_answer_loss": NaN, + "step": 793, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1472, + "grad_norm": 1.4765625, + "learning_rate": 2.0412328863318803e-05, + "long_answer_loss": 0.1472, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 794, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1577, + "grad_norm": 1.328125, + "learning_rate": 2.040033962712612e-05, + "long_answer_loss": 0.1577, + "loss": 0.1516, + "short_answer_loss": NaN, + "step": 795, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1296, + "grad_norm": 1.40625, + "learning_rate": 2.0388338276460695e-05, + "long_answer_loss": 0.1296, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 796, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.142, + "grad_norm": 1.4296875, + "learning_rate": 2.037632482972554e-05, + "long_answer_loss": 0.142, + "loss": 0.1471, + "short_answer_loss": NaN, + "step": 797, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1248, + "grad_norm": 1.3828125, + "learning_rate": 2.0364299305342223e-05, + "long_answer_loss": 0.1248, + "loss": 0.1519, + "short_answer_loss": NaN, + "step": 798, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1292, + "grad_norm": 1.4921875, + "learning_rate": 2.035226172175081e-05, + "long_answer_loss": 0.1292, + "loss": 0.1553, + "short_answer_loss": NaN, + "step": 799, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1608, + "grad_norm": 1.4765625, + "learning_rate": 2.0340212097409878e-05, + "long_answer_loss": 0.1608, + "loss": 0.1593, + "short_answer_loss": NaN, + "step": 800, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1463, + "grad_norm": 1.4375, + "learning_rate": 2.032815045079646e-05, + "long_answer_loss": 0.1463, + "loss": 0.1602, + "short_answer_loss": NaN, + "step": 801, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1599, + "grad_norm": 1.4921875, + "learning_rate": 2.0316076800406024e-05, + "long_answer_loss": 0.1599, + "loss": 0.1523, + "short_answer_loss": NaN, + "step": 802, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1606, + "grad_norm": 1.4453125, + "learning_rate": 2.0303991164752455e-05, + "long_answer_loss": 0.1606, + "loss": 0.1462, + "short_answer_loss": NaN, + "step": 803, + "template_loss": 0.0 + }, + { + "epoch": 0.61, + "full_loss": 0.1608, + "grad_norm": 1.4921875, + "learning_rate": 2.0291893562368e-05, + "long_answer_loss": 0.1608, + "loss": 0.1615, + "short_answer_loss": NaN, + "step": 804, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1578, + "grad_norm": 1.453125, + "learning_rate": 2.027978401180326e-05, + "long_answer_loss": 0.1578, + "loss": 0.1507, + "short_answer_loss": NaN, + "step": 805, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1508, + "grad_norm": 1.3828125, + "learning_rate": 2.0267662531627163e-05, + "long_answer_loss": 0.1508, + "loss": 0.1479, + "short_answer_loss": NaN, + "step": 806, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1581, + "grad_norm": 1.4921875, + "learning_rate": 2.025552914042693e-05, + "long_answer_loss": 0.1581, + "loss": 0.1589, + "short_answer_loss": NaN, + "step": 807, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1855, + "grad_norm": 1.5390625, + "learning_rate": 2.0243383856808046e-05, + "long_answer_loss": 0.1855, + "loss": 0.1555, + "short_answer_loss": NaN, + "step": 808, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1574, + "grad_norm": 1.4921875, + "learning_rate": 2.023122669939423e-05, + "long_answer_loss": 0.1574, + "loss": 0.1562, + "short_answer_loss": NaN, + "step": 809, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1511, + "grad_norm": 1.4765625, + "learning_rate": 2.02190576868274e-05, + "long_answer_loss": 0.1511, + "loss": 0.1534, + "short_answer_loss": NaN, + "step": 810, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1497, + "grad_norm": 1.390625, + "learning_rate": 2.0206876837767673e-05, + "long_answer_loss": 0.1497, + "loss": 0.1455, + "short_answer_loss": NaN, + "step": 811, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1658, + "grad_norm": 1.421875, + "learning_rate": 2.0194684170893296e-05, + "long_answer_loss": 0.1658, + "loss": 0.1461, + "short_answer_loss": NaN, + "step": 812, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1502, + "grad_norm": 1.5, + "learning_rate": 2.0182479704900654e-05, + "long_answer_loss": 0.1502, + "loss": 0.1458, + "short_answer_loss": NaN, + "step": 813, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1484, + "grad_norm": 1.390625, + "learning_rate": 2.017026345850421e-05, + "long_answer_loss": 0.1484, + "loss": 0.1643, + "short_answer_loss": NaN, + "step": 814, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1304, + "grad_norm": 1.46875, + "learning_rate": 2.0158035450436504e-05, + "long_answer_loss": 0.1304, + "loss": 0.155, + "short_answer_loss": NaN, + "step": 815, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.146, + "grad_norm": 1.421875, + "learning_rate": 2.01457956994481e-05, + "long_answer_loss": 0.146, + "loss": 0.1557, + "short_answer_loss": NaN, + "step": 816, + "template_loss": 0.0 + }, + { + "epoch": 0.62, + "full_loss": 0.1492, + "grad_norm": 1.4296875, + "learning_rate": 2.0133544224307582e-05, + "long_answer_loss": 0.1492, + "loss": 0.1531, + "short_answer_loss": NaN, + "step": 817, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1572, + "grad_norm": 1.421875, + "learning_rate": 2.0121281043801498e-05, + "long_answer_loss": 0.1572, + "loss": 0.1527, + "short_answer_loss": NaN, + "step": 818, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1398, + "grad_norm": 1.4609375, + "learning_rate": 2.0109006176734356e-05, + "long_answer_loss": 0.1398, + "loss": 0.1535, + "short_answer_loss": NaN, + "step": 819, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1538, + "grad_norm": 1.4609375, + "learning_rate": 2.009671964192858e-05, + "long_answer_loss": 0.1538, + "loss": 0.1589, + "short_answer_loss": NaN, + "step": 820, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1506, + "grad_norm": 1.3984375, + "learning_rate": 2.008442145822448e-05, + "long_answer_loss": 0.1506, + "loss": 0.1513, + "short_answer_loss": NaN, + "step": 821, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1531, + "grad_norm": 1.5078125, + "learning_rate": 2.007211164448024e-05, + "long_answer_loss": 0.1531, + "loss": 0.1491, + "short_answer_loss": NaN, + "step": 822, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1577, + "grad_norm": 1.3828125, + "learning_rate": 2.0059790219571872e-05, + "long_answer_loss": 0.1577, + "loss": 0.1578, + "short_answer_loss": NaN, + "step": 823, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1292, + "grad_norm": 1.296875, + "learning_rate": 2.004745720239319e-05, + "long_answer_loss": 0.1292, + "loss": 0.1488, + "short_answer_loss": NaN, + "step": 824, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1581, + "grad_norm": 1.40625, + "learning_rate": 2.0035112611855784e-05, + "long_answer_loss": 0.1581, + "loss": 0.1513, + "short_answer_loss": NaN, + "step": 825, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.177, + "grad_norm": 1.4296875, + "learning_rate": 2.0022756466888996e-05, + "long_answer_loss": 0.177, + "loss": 0.1508, + "short_answer_loss": NaN, + "step": 826, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1555, + "grad_norm": 1.40625, + "learning_rate": 2.001038878643988e-05, + "long_answer_loss": 0.1555, + "loss": 0.1525, + "short_answer_loss": NaN, + "step": 827, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1664, + "grad_norm": 1.40625, + "learning_rate": 1.999800958947318e-05, + "long_answer_loss": 0.1664, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 828, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1548, + "grad_norm": 1.515625, + "learning_rate": 1.998561889497131e-05, + "long_answer_loss": 0.1548, + "loss": 0.1596, + "short_answer_loss": NaN, + "step": 829, + "template_loss": 0.0 + }, + { + "epoch": 0.63, + "full_loss": 0.1783, + "grad_norm": 1.53125, + "learning_rate": 1.9973216721934296e-05, + "long_answer_loss": 0.1783, + "loss": 0.1595, + "short_answer_loss": NaN, + "step": 830, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1466, + "grad_norm": 1.3046875, + "learning_rate": 1.9960803089379776e-05, + "long_answer_loss": 0.1466, + "loss": 0.1391, + "short_answer_loss": NaN, + "step": 831, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1424, + "grad_norm": 1.4921875, + "learning_rate": 1.9948378016342962e-05, + "long_answer_loss": 0.1424, + "loss": 0.1517, + "short_answer_loss": NaN, + "step": 832, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1493, + "grad_norm": 1.4609375, + "learning_rate": 1.99359415218766e-05, + "long_answer_loss": 0.1493, + "loss": 0.1546, + "short_answer_loss": NaN, + "step": 833, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1591, + "grad_norm": 1.3828125, + "learning_rate": 1.992349362505096e-05, + "long_answer_loss": 0.1591, + "loss": 0.1513, + "short_answer_loss": NaN, + "step": 834, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1555, + "grad_norm": 1.453125, + "learning_rate": 1.991103434495379e-05, + "long_answer_loss": 0.1555, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 835, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1568, + "grad_norm": 1.4140625, + "learning_rate": 1.9898563700690298e-05, + "long_answer_loss": 0.1568, + "loss": 0.1441, + "short_answer_loss": NaN, + "step": 836, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1671, + "grad_norm": 1.3125, + "learning_rate": 1.9886081711383108e-05, + "long_answer_loss": 0.1671, + "loss": 0.1434, + "short_answer_loss": NaN, + "step": 837, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1579, + "grad_norm": 1.3828125, + "learning_rate": 1.9873588396172257e-05, + "long_answer_loss": 0.1579, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 838, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1626, + "grad_norm": 1.46875, + "learning_rate": 1.9861083774215133e-05, + "long_answer_loss": 0.1626, + "loss": 0.1518, + "short_answer_loss": NaN, + "step": 839, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1446, + "grad_norm": 1.4296875, + "learning_rate": 1.9848567864686474e-05, + "long_answer_loss": 0.1446, + "loss": 0.1511, + "short_answer_loss": NaN, + "step": 840, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1301, + "grad_norm": 1.5, + "learning_rate": 1.9836040686778316e-05, + "long_answer_loss": 0.1301, + "loss": 0.1459, + "short_answer_loss": NaN, + "step": 841, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1551, + "grad_norm": 1.421875, + "learning_rate": 1.982350225969998e-05, + "long_answer_loss": 0.1551, + "loss": 0.1445, + "short_answer_loss": NaN, + "step": 842, + "template_loss": 0.0 + }, + { + "epoch": 0.64, + "full_loss": 0.1423, + "grad_norm": 1.46875, + "learning_rate": 1.981095260267804e-05, + "long_answer_loss": 0.1423, + "loss": 0.1449, + "short_answer_loss": NaN, + "step": 843, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1767, + "grad_norm": 1.453125, + "learning_rate": 1.9798391734956284e-05, + "long_answer_loss": 0.1767, + "loss": 0.1573, + "short_answer_loss": NaN, + "step": 844, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1634, + "grad_norm": 1.5625, + "learning_rate": 1.9785819675795698e-05, + "long_answer_loss": 0.1634, + "loss": 0.1554, + "short_answer_loss": NaN, + "step": 845, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1774, + "grad_norm": 1.3671875, + "learning_rate": 1.9773236444474414e-05, + "long_answer_loss": 0.1774, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 846, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1488, + "grad_norm": 1.2890625, + "learning_rate": 1.976064206028771e-05, + "long_answer_loss": 0.1488, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 847, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1502, + "grad_norm": 1.4140625, + "learning_rate": 1.974803654254796e-05, + "long_answer_loss": 0.1502, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 848, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1375, + "grad_norm": 1.375, + "learning_rate": 1.9735419910584616e-05, + "long_answer_loss": 0.1375, + "loss": 0.1568, + "short_answer_loss": NaN, + "step": 849, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1344, + "grad_norm": 1.4765625, + "learning_rate": 1.9722792183744162e-05, + "long_answer_loss": 0.1344, + "loss": 0.1525, + "short_answer_loss": NaN, + "step": 850, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1453, + "grad_norm": 1.4140625, + "learning_rate": 1.9710153381390108e-05, + "long_answer_loss": 0.1453, + "loss": 0.151, + "short_answer_loss": NaN, + "step": 851, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1454, + "grad_norm": 1.3515625, + "learning_rate": 1.9697503522902936e-05, + "long_answer_loss": 0.1454, + "loss": 0.1489, + "short_answer_loss": NaN, + "step": 852, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1466, + "grad_norm": 1.578125, + "learning_rate": 1.9684842627680088e-05, + "long_answer_loss": 0.1466, + "loss": 0.1543, + "short_answer_loss": NaN, + "step": 853, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1528, + "grad_norm": 1.28125, + "learning_rate": 1.9672170715135927e-05, + "long_answer_loss": 0.1528, + "loss": 0.1467, + "short_answer_loss": NaN, + "step": 854, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1578, + "grad_norm": 1.34375, + "learning_rate": 1.965948780470171e-05, + "long_answer_loss": 0.1578, + "loss": 0.1475, + "short_answer_loss": NaN, + "step": 855, + "template_loss": 0.0 + }, + { + "epoch": 0.65, + "full_loss": 0.1392, + "grad_norm": 1.4609375, + "learning_rate": 1.964679391582557e-05, + "long_answer_loss": 0.1392, + "loss": 0.1542, + "short_answer_loss": NaN, + "step": 856, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1529, + "grad_norm": 1.3359375, + "learning_rate": 1.9634089067972445e-05, + "long_answer_loss": 0.1529, + "loss": 0.1483, + "short_answer_loss": NaN, + "step": 857, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1587, + "grad_norm": 1.359375, + "learning_rate": 1.962137328062411e-05, + "long_answer_loss": 0.1587, + "loss": 0.1532, + "short_answer_loss": NaN, + "step": 858, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1671, + "grad_norm": 1.5078125, + "learning_rate": 1.9608646573279098e-05, + "long_answer_loss": 0.1671, + "loss": 0.1638, + "short_answer_loss": NaN, + "step": 859, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1444, + "grad_norm": 1.375, + "learning_rate": 1.9595908965452692e-05, + "long_answer_loss": 0.1444, + "loss": 0.1475, + "short_answer_loss": NaN, + "step": 860, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1472, + "grad_norm": 1.359375, + "learning_rate": 1.9583160476676885e-05, + "long_answer_loss": 0.1472, + "loss": 0.1566, + "short_answer_loss": NaN, + "step": 861, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1496, + "grad_norm": 1.40625, + "learning_rate": 1.957040112650036e-05, + "long_answer_loss": 0.1496, + "loss": 0.156, + "short_answer_loss": NaN, + "step": 862, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1416, + "grad_norm": 1.359375, + "learning_rate": 1.955763093448845e-05, + "long_answer_loss": 0.1416, + "loss": 0.1537, + "short_answer_loss": NaN, + "step": 863, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1567, + "grad_norm": 1.3828125, + "learning_rate": 1.9544849920223123e-05, + "long_answer_loss": 0.1567, + "loss": 0.1499, + "short_answer_loss": NaN, + "step": 864, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1551, + "grad_norm": 1.3828125, + "learning_rate": 1.953205810330293e-05, + "long_answer_loss": 0.1551, + "loss": 0.154, + "short_answer_loss": NaN, + "step": 865, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1438, + "grad_norm": 1.4140625, + "learning_rate": 1.951925550334299e-05, + "long_answer_loss": 0.1438, + "loss": 0.1523, + "short_answer_loss": NaN, + "step": 866, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1265, + "grad_norm": 1.4765625, + "learning_rate": 1.950644213997496e-05, + "long_answer_loss": 0.1265, + "loss": 0.148, + "short_answer_loss": NaN, + "step": 867, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.1521, + "grad_norm": 1.5, + "learning_rate": 1.949361803284701e-05, + "long_answer_loss": 0.1521, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 868, + "template_loss": 0.0 + }, + { + "epoch": 0.66, + "full_loss": 0.156, + "grad_norm": 1.4296875, + "learning_rate": 1.948078320162376e-05, + "long_answer_loss": 0.156, + "loss": 0.1512, + "short_answer_loss": NaN, + "step": 869, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1356, + "grad_norm": 1.484375, + "learning_rate": 1.94679376659863e-05, + "long_answer_loss": 0.1356, + "loss": 0.1499, + "short_answer_loss": NaN, + "step": 870, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1411, + "grad_norm": 1.46875, + "learning_rate": 1.945508144563212e-05, + "long_answer_loss": 0.1411, + "loss": 0.1558, + "short_answer_loss": NaN, + "step": 871, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1385, + "grad_norm": 1.4296875, + "learning_rate": 1.9442214560275096e-05, + "long_answer_loss": 0.1385, + "loss": 0.1422, + "short_answer_loss": NaN, + "step": 872, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1638, + "grad_norm": 1.421875, + "learning_rate": 1.9429337029645464e-05, + "long_answer_loss": 0.1638, + "loss": 0.1554, + "short_answer_loss": NaN, + "step": 873, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1393, + "grad_norm": 1.3984375, + "learning_rate": 1.9416448873489775e-05, + "long_answer_loss": 0.1393, + "loss": 0.1542, + "short_answer_loss": NaN, + "step": 874, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1456, + "grad_norm": 1.40625, + "learning_rate": 1.9403550111570883e-05, + "long_answer_loss": 0.1456, + "loss": 0.1492, + "short_answer_loss": NaN, + "step": 875, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1548, + "grad_norm": 1.390625, + "learning_rate": 1.93906407636679e-05, + "long_answer_loss": 0.1548, + "loss": 0.1585, + "short_answer_loss": NaN, + "step": 876, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1319, + "grad_norm": 1.3359375, + "learning_rate": 1.9377720849576164e-05, + "long_answer_loss": 0.1319, + "loss": 0.1454, + "short_answer_loss": NaN, + "step": 877, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.151, + "grad_norm": 1.453125, + "learning_rate": 1.9364790389107224e-05, + "long_answer_loss": 0.151, + "loss": 0.1583, + "short_answer_loss": NaN, + "step": 878, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1422, + "grad_norm": 1.4375, + "learning_rate": 1.93518494020888e-05, + "long_answer_loss": 0.1422, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 879, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1457, + "grad_norm": 1.3828125, + "learning_rate": 1.933889790836475e-05, + "long_answer_loss": 0.1457, + "loss": 0.1545, + "short_answer_loss": NaN, + "step": 880, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1346, + "grad_norm": 1.421875, + "learning_rate": 1.9325935927795052e-05, + "long_answer_loss": 0.1346, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 881, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1691, + "grad_norm": 1.4609375, + "learning_rate": 1.9312963480255746e-05, + "long_answer_loss": 0.1691, + "loss": 0.1523, + "short_answer_loss": NaN, + "step": 882, + "template_loss": 0.0 + }, + { + "epoch": 0.67, + "full_loss": 0.1604, + "grad_norm": 1.3359375, + "learning_rate": 1.9299980585638946e-05, + "long_answer_loss": 0.1604, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 883, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1416, + "grad_norm": 1.5078125, + "learning_rate": 1.9286987263852767e-05, + "long_answer_loss": 0.1416, + "loss": 0.1491, + "short_answer_loss": NaN, + "step": 884, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1696, + "grad_norm": 1.453125, + "learning_rate": 1.927398353482132e-05, + "long_answer_loss": 0.1696, + "loss": 0.1525, + "short_answer_loss": NaN, + "step": 885, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1413, + "grad_norm": 1.3828125, + "learning_rate": 1.9260969418484677e-05, + "long_answer_loss": 0.1413, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 886, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1525, + "grad_norm": 1.453125, + "learning_rate": 1.9247944934798835e-05, + "long_answer_loss": 0.1525, + "loss": 0.1508, + "short_answer_loss": NaN, + "step": 887, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1834, + "grad_norm": 1.4140625, + "learning_rate": 1.9234910103735686e-05, + "long_answer_loss": 0.1834, + "loss": 0.1527, + "short_answer_loss": NaN, + "step": 888, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1449, + "grad_norm": 1.3984375, + "learning_rate": 1.9221864945282997e-05, + "long_answer_loss": 0.1449, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 889, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1531, + "grad_norm": 1.4140625, + "learning_rate": 1.920880947944436e-05, + "long_answer_loss": 0.1531, + "loss": 0.149, + "short_answer_loss": NaN, + "step": 890, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1307, + "grad_norm": 1.3984375, + "learning_rate": 1.919574372623918e-05, + "long_answer_loss": 0.1307, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 891, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1422, + "grad_norm": 1.3671875, + "learning_rate": 1.918266770570264e-05, + "long_answer_loss": 0.1422, + "loss": 0.1524, + "short_answer_loss": NaN, + "step": 892, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1324, + "grad_norm": 1.4609375, + "learning_rate": 1.9169581437885654e-05, + "long_answer_loss": 0.1324, + "loss": 0.1496, + "short_answer_loss": NaN, + "step": 893, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1687, + "grad_norm": 1.421875, + "learning_rate": 1.915648494285486e-05, + "long_answer_loss": 0.1687, + "loss": 0.1525, + "short_answer_loss": NaN, + "step": 894, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1571, + "grad_norm": 1.3515625, + "learning_rate": 1.9143378240692578e-05, + "long_answer_loss": 0.1571, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 895, + "template_loss": 0.0 + }, + { + "epoch": 0.68, + "full_loss": 0.1396, + "grad_norm": 1.4453125, + "learning_rate": 1.913026135149678e-05, + "long_answer_loss": 0.1396, + "loss": 0.1541, + "short_answer_loss": NaN, + "step": 896, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1529, + "grad_norm": 1.40625, + "learning_rate": 1.9117134295381056e-05, + "long_answer_loss": 0.1529, + "loss": 0.1468, + "short_answer_loss": NaN, + "step": 897, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1445, + "grad_norm": 1.390625, + "learning_rate": 1.910399709247458e-05, + "long_answer_loss": 0.1445, + "loss": 0.1454, + "short_answer_loss": NaN, + "step": 898, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1334, + "grad_norm": 1.4296875, + "learning_rate": 1.90908497629221e-05, + "long_answer_loss": 0.1334, + "loss": 0.1533, + "short_answer_loss": NaN, + "step": 899, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.133, + "grad_norm": 1.4453125, + "learning_rate": 1.9077692326883872e-05, + "long_answer_loss": 0.133, + "loss": 0.1476, + "short_answer_loss": NaN, + "step": 900, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1569, + "grad_norm": 1.4296875, + "learning_rate": 1.9064524804535674e-05, + "long_answer_loss": 0.1569, + "loss": 0.1481, + "short_answer_loss": NaN, + "step": 901, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1252, + "grad_norm": 1.390625, + "learning_rate": 1.9051347216068734e-05, + "long_answer_loss": 0.1252, + "loss": 0.1479, + "short_answer_loss": NaN, + "step": 902, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.141, + "grad_norm": 1.4921875, + "learning_rate": 1.903815958168972e-05, + "long_answer_loss": 0.141, + "loss": 0.1476, + "short_answer_loss": NaN, + "step": 903, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1579, + "grad_norm": 1.421875, + "learning_rate": 1.9024961921620705e-05, + "long_answer_loss": 0.1579, + "loss": 0.1512, + "short_answer_loss": NaN, + "step": 904, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.149, + "grad_norm": 1.3671875, + "learning_rate": 1.9011754256099128e-05, + "long_answer_loss": 0.149, + "loss": 0.1504, + "short_answer_loss": NaN, + "step": 905, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1573, + "grad_norm": 1.53125, + "learning_rate": 1.8998536605377788e-05, + "long_answer_loss": 0.1573, + "loss": 0.1488, + "short_answer_loss": NaN, + "step": 906, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.139, + "grad_norm": 1.4765625, + "learning_rate": 1.8985308989724776e-05, + "long_answer_loss": 0.139, + "loss": 0.1501, + "short_answer_loss": NaN, + "step": 907, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.1844, + "grad_norm": 1.46875, + "learning_rate": 1.8972071429423473e-05, + "long_answer_loss": 0.1844, + "loss": 0.1444, + "short_answer_loss": NaN, + "step": 908, + "template_loss": 0.0 + }, + { + "epoch": 0.69, + "full_loss": 0.157, + "grad_norm": 1.4921875, + "learning_rate": 1.8958823944772508e-05, + "long_answer_loss": 0.157, + "loss": 0.1546, + "short_answer_loss": NaN, + "step": 909, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1433, + "grad_norm": 1.5546875, + "learning_rate": 1.894556655608573e-05, + "long_answer_loss": 0.1433, + "loss": 0.1423, + "short_answer_loss": NaN, + "step": 910, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1477, + "grad_norm": 1.3984375, + "learning_rate": 1.8932299283692177e-05, + "long_answer_loss": 0.1477, + "loss": 0.1412, + "short_answer_loss": NaN, + "step": 911, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1523, + "grad_norm": 1.421875, + "learning_rate": 1.891902214793603e-05, + "long_answer_loss": 0.1523, + "loss": 0.1482, + "short_answer_loss": NaN, + "step": 912, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1541, + "grad_norm": 1.46875, + "learning_rate": 1.890573516917661e-05, + "long_answer_loss": 0.1541, + "loss": 0.1517, + "short_answer_loss": NaN, + "step": 913, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1398, + "grad_norm": 1.4453125, + "learning_rate": 1.889243836778832e-05, + "long_answer_loss": 0.1398, + "loss": 0.1453, + "short_answer_loss": NaN, + "step": 914, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1336, + "grad_norm": 1.296875, + "learning_rate": 1.8879131764160635e-05, + "long_answer_loss": 0.1336, + "loss": 0.1479, + "short_answer_loss": NaN, + "step": 915, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1259, + "grad_norm": 1.34375, + "learning_rate": 1.8865815378698052e-05, + "long_answer_loss": 0.1259, + "loss": 0.1437, + "short_answer_loss": NaN, + "step": 916, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1738, + "grad_norm": 1.453125, + "learning_rate": 1.8852489231820076e-05, + "long_answer_loss": 0.1738, + "loss": 0.1458, + "short_answer_loss": NaN, + "step": 917, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1643, + "grad_norm": 1.4296875, + "learning_rate": 1.883915334396117e-05, + "long_answer_loss": 0.1643, + "loss": 0.158, + "short_answer_loss": NaN, + "step": 918, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1383, + "grad_norm": 1.4453125, + "learning_rate": 1.8825807735570748e-05, + "long_answer_loss": 0.1383, + "loss": 0.1573, + "short_answer_loss": NaN, + "step": 919, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1543, + "grad_norm": 1.5078125, + "learning_rate": 1.881245242711311e-05, + "long_answer_loss": 0.1543, + "loss": 0.152, + "short_answer_loss": NaN, + "step": 920, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1397, + "grad_norm": 1.3671875, + "learning_rate": 1.879908743906745e-05, + "long_answer_loss": 0.1397, + "loss": 0.1497, + "short_answer_loss": NaN, + "step": 921, + "template_loss": 0.0 + }, + { + "epoch": 0.7, + "full_loss": 0.1374, + "grad_norm": 1.40625, + "learning_rate": 1.878571279192779e-05, + "long_answer_loss": 0.1374, + "loss": 0.1483, + "short_answer_loss": NaN, + "step": 922, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1486, + "grad_norm": 1.40625, + "learning_rate": 1.8772328506202972e-05, + "long_answer_loss": 0.1486, + "loss": 0.1461, + "short_answer_loss": NaN, + "step": 923, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1676, + "grad_norm": 1.4296875, + "learning_rate": 1.8758934602416623e-05, + "long_answer_loss": 0.1676, + "loss": 0.1456, + "short_answer_loss": NaN, + "step": 924, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1361, + "grad_norm": 1.359375, + "learning_rate": 1.8745531101107104e-05, + "long_answer_loss": 0.1361, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 925, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1458, + "grad_norm": 1.390625, + "learning_rate": 1.87321180228275e-05, + "long_answer_loss": 0.1458, + "loss": 0.1402, + "short_answer_loss": NaN, + "step": 926, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1766, + "grad_norm": 1.3984375, + "learning_rate": 1.871869538814558e-05, + "long_answer_loss": 0.1766, + "loss": 0.1544, + "short_answer_loss": NaN, + "step": 927, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1668, + "grad_norm": 1.40625, + "learning_rate": 1.870526321764377e-05, + "long_answer_loss": 0.1668, + "loss": 0.1472, + "short_answer_loss": NaN, + "step": 928, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1488, + "grad_norm": 1.4609375, + "learning_rate": 1.8691821531919117e-05, + "long_answer_loss": 0.1488, + "loss": 0.1526, + "short_answer_loss": NaN, + "step": 929, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.156, + "grad_norm": 1.4609375, + "learning_rate": 1.8678370351583256e-05, + "long_answer_loss": 0.156, + "loss": 0.1519, + "short_answer_loss": NaN, + "step": 930, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1304, + "grad_norm": 1.4921875, + "learning_rate": 1.866490969726239e-05, + "long_answer_loss": 0.1304, + "loss": 0.1449, + "short_answer_loss": NaN, + "step": 931, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1553, + "grad_norm": 1.46875, + "learning_rate": 1.8651439589597235e-05, + "long_answer_loss": 0.1553, + "loss": 0.1516, + "short_answer_loss": NaN, + "step": 932, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1225, + "grad_norm": 1.3828125, + "learning_rate": 1.8637960049243013e-05, + "long_answer_loss": 0.1225, + "loss": 0.1434, + "short_answer_loss": NaN, + "step": 933, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1371, + "grad_norm": 1.4453125, + "learning_rate": 1.8624471096869417e-05, + "long_answer_loss": 0.1371, + "loss": 0.1507, + "short_answer_loss": NaN, + "step": 934, + "template_loss": 0.0 + }, + { + "epoch": 0.71, + "full_loss": 0.1215, + "grad_norm": 1.3828125, + "learning_rate": 1.861097275316055e-05, + "long_answer_loss": 0.1215, + "loss": 0.1485, + "short_answer_loss": NaN, + "step": 935, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.133, + "grad_norm": 1.3203125, + "learning_rate": 1.8597465038814936e-05, + "long_answer_loss": 0.133, + "loss": 0.1495, + "short_answer_loss": NaN, + "step": 936, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1611, + "grad_norm": 1.4453125, + "learning_rate": 1.8583947974545462e-05, + "long_answer_loss": 0.1611, + "loss": 0.152, + "short_answer_loss": NaN, + "step": 937, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.144, + "grad_norm": 1.4453125, + "learning_rate": 1.857042158107935e-05, + "long_answer_loss": 0.144, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 938, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1488, + "grad_norm": 1.3671875, + "learning_rate": 1.855688587915813e-05, + "long_answer_loss": 0.1488, + "loss": 0.1432, + "short_answer_loss": NaN, + "step": 939, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1342, + "grad_norm": 1.390625, + "learning_rate": 1.85433408895376e-05, + "long_answer_loss": 0.1342, + "loss": 0.1423, + "short_answer_loss": NaN, + "step": 940, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1574, + "grad_norm": 1.421875, + "learning_rate": 1.8529786632987815e-05, + "long_answer_loss": 0.1574, + "loss": 0.1471, + "short_answer_loss": NaN, + "step": 941, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1266, + "grad_norm": 1.4296875, + "learning_rate": 1.8516223130293024e-05, + "long_answer_loss": 0.1266, + "loss": 0.1372, + "short_answer_loss": NaN, + "step": 942, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1552, + "grad_norm": 1.5, + "learning_rate": 1.850265040225166e-05, + "long_answer_loss": 0.1552, + "loss": 0.1581, + "short_answer_loss": NaN, + "step": 943, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1505, + "grad_norm": 1.421875, + "learning_rate": 1.8489068469676298e-05, + "long_answer_loss": 0.1505, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 944, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1384, + "grad_norm": 1.3984375, + "learning_rate": 1.8475477353393635e-05, + "long_answer_loss": 0.1384, + "loss": 0.1415, + "short_answer_loss": NaN, + "step": 945, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1615, + "grad_norm": 1.578125, + "learning_rate": 1.846187707424445e-05, + "long_answer_loss": 0.1615, + "loss": 0.1433, + "short_answer_loss": NaN, + "step": 946, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1301, + "grad_norm": 1.4453125, + "learning_rate": 1.844826765308357e-05, + "long_answer_loss": 0.1301, + "loss": 0.1417, + "short_answer_loss": NaN, + "step": 947, + "template_loss": 0.0 + }, + { + "epoch": 0.72, + "full_loss": 0.1422, + "grad_norm": 1.3828125, + "learning_rate": 1.8434649110779833e-05, + "long_answer_loss": 0.1422, + "loss": 0.1424, + "short_answer_loss": NaN, + "step": 948, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1435, + "grad_norm": 1.4140625, + "learning_rate": 1.8421021468216075e-05, + "long_answer_loss": 0.1435, + "loss": 0.1419, + "short_answer_loss": NaN, + "step": 949, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.158, + "grad_norm": 1.515625, + "learning_rate": 1.8407384746289084e-05, + "long_answer_loss": 0.158, + "loss": 0.1517, + "short_answer_loss": NaN, + "step": 950, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1334, + "grad_norm": 1.4140625, + "learning_rate": 1.839373896590956e-05, + "long_answer_loss": 0.1334, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 951, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.158, + "grad_norm": 1.4921875, + "learning_rate": 1.8380084148002104e-05, + "long_answer_loss": 0.158, + "loss": 0.1474, + "short_answer_loss": NaN, + "step": 952, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1498, + "grad_norm": 1.3984375, + "learning_rate": 1.8366420313505182e-05, + "long_answer_loss": 0.1498, + "loss": 0.1438, + "short_answer_loss": NaN, + "step": 953, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1349, + "grad_norm": 1.5078125, + "learning_rate": 1.8352747483371064e-05, + "long_answer_loss": 0.1349, + "loss": 0.1529, + "short_answer_loss": NaN, + "step": 954, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1247, + "grad_norm": 1.4921875, + "learning_rate": 1.8339065678565835e-05, + "long_answer_loss": 0.1247, + "loss": 0.1535, + "short_answer_loss": NaN, + "step": 955, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1573, + "grad_norm": 1.3828125, + "learning_rate": 1.8325374920069333e-05, + "long_answer_loss": 0.1573, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 956, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1422, + "grad_norm": 1.578125, + "learning_rate": 1.831167522887512e-05, + "long_answer_loss": 0.1422, + "loss": 0.1534, + "short_answer_loss": NaN, + "step": 957, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1438, + "grad_norm": 1.5859375, + "learning_rate": 1.8297966625990474e-05, + "long_answer_loss": 0.1438, + "loss": 0.1542, + "short_answer_loss": NaN, + "step": 958, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1689, + "grad_norm": 1.6015625, + "learning_rate": 1.8284249132436316e-05, + "long_answer_loss": 0.1689, + "loss": 0.1487, + "short_answer_loss": NaN, + "step": 959, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1494, + "grad_norm": 1.5390625, + "learning_rate": 1.8270522769247212e-05, + "long_answer_loss": 0.1494, + "loss": 0.1537, + "short_answer_loss": NaN, + "step": 960, + "template_loss": 0.0 + }, + { + "epoch": 0.73, + "full_loss": 0.1626, + "grad_norm": 1.4921875, + "learning_rate": 1.8256787557471328e-05, + "long_answer_loss": 0.1626, + "loss": 0.1569, + "short_answer_loss": NaN, + "step": 961, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1434, + "grad_norm": 1.4140625, + "learning_rate": 1.8243043518170395e-05, + "long_answer_loss": 0.1434, + "loss": 0.1516, + "short_answer_loss": NaN, + "step": 962, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1507, + "grad_norm": 1.4921875, + "learning_rate": 1.822929067241969e-05, + "long_answer_loss": 0.1507, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 963, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1427, + "grad_norm": 1.453125, + "learning_rate": 1.8215529041307982e-05, + "long_answer_loss": 0.1427, + "loss": 0.143, + "short_answer_loss": NaN, + "step": 964, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1768, + "grad_norm": 1.4921875, + "learning_rate": 1.8201758645937518e-05, + "long_answer_loss": 0.1768, + "loss": 0.1545, + "short_answer_loss": NaN, + "step": 965, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1529, + "grad_norm": 1.390625, + "learning_rate": 1.818797950742398e-05, + "long_answer_loss": 0.1529, + "loss": 0.1445, + "short_answer_loss": NaN, + "step": 966, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1398, + "grad_norm": 1.4296875, + "learning_rate": 1.817419164689646e-05, + "long_answer_loss": 0.1398, + "loss": 0.1457, + "short_answer_loss": NaN, + "step": 967, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1818, + "grad_norm": 1.3359375, + "learning_rate": 1.8160395085497428e-05, + "long_answer_loss": 0.1818, + "loss": 0.1415, + "short_answer_loss": NaN, + "step": 968, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1445, + "grad_norm": 1.4765625, + "learning_rate": 1.8146589844382686e-05, + "long_answer_loss": 0.1445, + "loss": 0.1487, + "short_answer_loss": NaN, + "step": 969, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1365, + "grad_norm": 1.34375, + "learning_rate": 1.8132775944721354e-05, + "long_answer_loss": 0.1365, + "loss": 0.1388, + "short_answer_loss": NaN, + "step": 970, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1537, + "grad_norm": 1.3828125, + "learning_rate": 1.8118953407695825e-05, + "long_answer_loss": 0.1537, + "loss": 0.1434, + "short_answer_loss": NaN, + "step": 971, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1437, + "grad_norm": 1.3671875, + "learning_rate": 1.8105122254501743e-05, + "long_answer_loss": 0.1437, + "loss": 0.1481, + "short_answer_loss": NaN, + "step": 972, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1409, + "grad_norm": 1.4765625, + "learning_rate": 1.8091282506347952e-05, + "long_answer_loss": 0.1409, + "loss": 0.1514, + "short_answer_loss": NaN, + "step": 973, + "template_loss": 0.0 + }, + { + "epoch": 0.74, + "full_loss": 0.1473, + "grad_norm": 1.4140625, + "learning_rate": 1.807743418445649e-05, + "long_answer_loss": 0.1473, + "loss": 0.1485, + "short_answer_loss": NaN, + "step": 974, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1855, + "grad_norm": 1.3515625, + "learning_rate": 1.8063577310062527e-05, + "long_answer_loss": 0.1855, + "loss": 0.1464, + "short_answer_loss": NaN, + "step": 975, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1422, + "grad_norm": 1.3671875, + "learning_rate": 1.8049711904414362e-05, + "long_answer_loss": 0.1422, + "loss": 0.1484, + "short_answer_loss": NaN, + "step": 976, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1332, + "grad_norm": 1.4609375, + "learning_rate": 1.803583798877337e-05, + "long_answer_loss": 0.1332, + "loss": 0.1376, + "short_answer_loss": NaN, + "step": 977, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1391, + "grad_norm": 1.5234375, + "learning_rate": 1.802195558441397e-05, + "long_answer_loss": 0.1391, + "loss": 0.1475, + "short_answer_loss": NaN, + "step": 978, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1537, + "grad_norm": 1.359375, + "learning_rate": 1.8008064712623607e-05, + "long_answer_loss": 0.1537, + "loss": 0.1367, + "short_answer_loss": NaN, + "step": 979, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1691, + "grad_norm": 1.3125, + "learning_rate": 1.7994165394702705e-05, + "long_answer_loss": 0.1691, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 980, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1577, + "grad_norm": 1.421875, + "learning_rate": 1.7980257651964634e-05, + "long_answer_loss": 0.1577, + "loss": 0.149, + "short_answer_loss": NaN, + "step": 981, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1244, + "grad_norm": 1.484375, + "learning_rate": 1.7966341505735695e-05, + "long_answer_loss": 0.1244, + "loss": 0.1453, + "short_answer_loss": NaN, + "step": 982, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1265, + "grad_norm": 1.3984375, + "learning_rate": 1.7952416977355063e-05, + "long_answer_loss": 0.1265, + "loss": 0.1491, + "short_answer_loss": NaN, + "step": 983, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1647, + "grad_norm": 1.3984375, + "learning_rate": 1.793848408817478e-05, + "long_answer_loss": 0.1647, + "loss": 0.1433, + "short_answer_loss": NaN, + "step": 984, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1331, + "grad_norm": 1.40625, + "learning_rate": 1.792454285955969e-05, + "long_answer_loss": 0.1331, + "loss": 0.1418, + "short_answer_loss": NaN, + "step": 985, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1528, + "grad_norm": 1.5078125, + "learning_rate": 1.7910593312887447e-05, + "long_answer_loss": 0.1528, + "loss": 0.1486, + "short_answer_loss": NaN, + "step": 986, + "template_loss": 0.0 + }, + { + "epoch": 0.75, + "full_loss": 0.1451, + "grad_norm": 1.4765625, + "learning_rate": 1.7896635469548438e-05, + "long_answer_loss": 0.1451, + "loss": 0.1495, + "short_answer_loss": NaN, + "step": 987, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1485, + "grad_norm": 1.3671875, + "learning_rate": 1.7882669350945787e-05, + "long_answer_loss": 0.1485, + "loss": 0.1357, + "short_answer_loss": NaN, + "step": 988, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1562, + "grad_norm": 1.359375, + "learning_rate": 1.7868694978495304e-05, + "long_answer_loss": 0.1562, + "loss": 0.1411, + "short_answer_loss": NaN, + "step": 989, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.13, + "grad_norm": 1.484375, + "learning_rate": 1.785471237362545e-05, + "long_answer_loss": 0.13, + "loss": 0.1494, + "short_answer_loss": NaN, + "step": 990, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1363, + "grad_norm": 1.2734375, + "learning_rate": 1.784072155777732e-05, + "long_answer_loss": 0.1363, + "loss": 0.1417, + "short_answer_loss": NaN, + "step": 991, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1632, + "grad_norm": 1.4140625, + "learning_rate": 1.782672255240459e-05, + "long_answer_loss": 0.1632, + "loss": 0.1447, + "short_answer_loss": NaN, + "step": 992, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1759, + "grad_norm": 1.4296875, + "learning_rate": 1.7812715378973495e-05, + "long_answer_loss": 0.1759, + "loss": 0.1479, + "short_answer_loss": NaN, + "step": 993, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1529, + "grad_norm": 1.4375, + "learning_rate": 1.7798700058962807e-05, + "long_answer_loss": 0.1529, + "loss": 0.1548, + "short_answer_loss": NaN, + "step": 994, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1382, + "grad_norm": 1.4140625, + "learning_rate": 1.778467661386377e-05, + "long_answer_loss": 0.1382, + "loss": 0.1413, + "short_answer_loss": NaN, + "step": 995, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1495, + "grad_norm": 1.5390625, + "learning_rate": 1.7770645065180106e-05, + "long_answer_loss": 0.1495, + "loss": 0.1493, + "short_answer_loss": NaN, + "step": 996, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1435, + "grad_norm": 1.390625, + "learning_rate": 1.7756605434427948e-05, + "long_answer_loss": 0.1435, + "loss": 0.1409, + "short_answer_loss": NaN, + "step": 997, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.161, + "grad_norm": 1.453125, + "learning_rate": 1.7742557743135836e-05, + "long_answer_loss": 0.161, + "loss": 0.1435, + "short_answer_loss": NaN, + "step": 998, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1362, + "grad_norm": 1.375, + "learning_rate": 1.7728502012844665e-05, + "long_answer_loss": 0.1362, + "loss": 0.1368, + "short_answer_loss": NaN, + "step": 999, + "template_loss": 0.0 + }, + { + "epoch": 0.76, + "full_loss": 0.1227, + "grad_norm": 1.3515625, + "learning_rate": 1.7714438265107643e-05, + "long_answer_loss": 0.1227, + "loss": 0.1407, + "short_answer_loss": NaN, + "step": 1000, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1565, + "grad_norm": 1.4609375, + "learning_rate": 1.7700366521490296e-05, + "long_answer_loss": 0.1565, + "loss": 0.1506, + "short_answer_loss": NaN, + "step": 1001, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1356, + "grad_norm": 1.3984375, + "learning_rate": 1.7686286803570398e-05, + "long_answer_loss": 0.1356, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 1002, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.142, + "grad_norm": 1.34375, + "learning_rate": 1.767219913293795e-05, + "long_answer_loss": 0.142, + "loss": 0.1409, + "short_answer_loss": NaN, + "step": 1003, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1271, + "grad_norm": 1.5, + "learning_rate": 1.765810353119515e-05, + "long_answer_loss": 0.1271, + "loss": 0.1491, + "short_answer_loss": NaN, + "step": 1004, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1491, + "grad_norm": 1.375, + "learning_rate": 1.7644000019956353e-05, + "long_answer_loss": 0.1491, + "loss": 0.1515, + "short_answer_loss": NaN, + "step": 1005, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1404, + "grad_norm": 1.46875, + "learning_rate": 1.7629888620848055e-05, + "long_answer_loss": 0.1404, + "loss": 0.1449, + "short_answer_loss": NaN, + "step": 1006, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.15, + "grad_norm": 1.4140625, + "learning_rate": 1.761576935550884e-05, + "long_answer_loss": 0.15, + "loss": 0.1476, + "short_answer_loss": NaN, + "step": 1007, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.122, + "grad_norm": 1.421875, + "learning_rate": 1.760164224558935e-05, + "long_answer_loss": 0.122, + "loss": 0.145, + "short_answer_loss": NaN, + "step": 1008, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1541, + "grad_norm": 1.453125, + "learning_rate": 1.7587507312752262e-05, + "long_answer_loss": 0.1541, + "loss": 0.1385, + "short_answer_loss": NaN, + "step": 1009, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1573, + "grad_norm": 1.4921875, + "learning_rate": 1.7573364578672244e-05, + "long_answer_loss": 0.1573, + "loss": 0.1464, + "short_answer_loss": NaN, + "step": 1010, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1624, + "grad_norm": 1.359375, + "learning_rate": 1.755921406503593e-05, + "long_answer_loss": 0.1624, + "loss": 0.1454, + "short_answer_loss": NaN, + "step": 1011, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1365, + "grad_norm": 1.3125, + "learning_rate": 1.754505579354188e-05, + "long_answer_loss": 0.1365, + "loss": 0.139, + "short_answer_loss": NaN, + "step": 1012, + "template_loss": 0.0 + }, + { + "epoch": 0.77, + "full_loss": 0.1558, + "grad_norm": 1.5546875, + "learning_rate": 1.7530889785900555e-05, + "long_answer_loss": 0.1558, + "loss": 0.1455, + "short_answer_loss": NaN, + "step": 1013, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1672, + "grad_norm": 1.4453125, + "learning_rate": 1.7516716063834278e-05, + "long_answer_loss": 0.1672, + "loss": 0.1458, + "short_answer_loss": NaN, + "step": 1014, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1342, + "grad_norm": 1.4921875, + "learning_rate": 1.7502534649077197e-05, + "long_answer_loss": 0.1342, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 1015, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1341, + "grad_norm": 1.3828125, + "learning_rate": 1.748834556337526e-05, + "long_answer_loss": 0.1341, + "loss": 0.1462, + "short_answer_loss": NaN, + "step": 1016, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1512, + "grad_norm": 1.34375, + "learning_rate": 1.7474148828486176e-05, + "long_answer_loss": 0.1512, + "loss": 0.145, + "short_answer_loss": NaN, + "step": 1017, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1629, + "grad_norm": 1.328125, + "learning_rate": 1.7459944466179377e-05, + "long_answer_loss": 0.1629, + "loss": 0.1442, + "short_answer_loss": NaN, + "step": 1018, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1454, + "grad_norm": 1.4296875, + "learning_rate": 1.744573249823601e-05, + "long_answer_loss": 0.1454, + "loss": 0.1408, + "short_answer_loss": NaN, + "step": 1019, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.135, + "grad_norm": 1.421875, + "learning_rate": 1.7431512946448862e-05, + "long_answer_loss": 0.135, + "loss": 0.137, + "short_answer_loss": NaN, + "step": 1020, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1469, + "grad_norm": 1.6171875, + "learning_rate": 1.741728583262236e-05, + "long_answer_loss": 0.1469, + "loss": 0.1501, + "short_answer_loss": NaN, + "step": 1021, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.132, + "grad_norm": 1.375, + "learning_rate": 1.7403051178572528e-05, + "long_answer_loss": 0.132, + "loss": 0.1334, + "short_answer_loss": NaN, + "step": 1022, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1249, + "grad_norm": 1.4375, + "learning_rate": 1.738880900612695e-05, + "long_answer_loss": 0.1249, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 1023, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1424, + "grad_norm": 1.6171875, + "learning_rate": 1.7374559337124743e-05, + "long_answer_loss": 0.1424, + "loss": 0.1475, + "short_answer_loss": NaN, + "step": 1024, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1555, + "grad_norm": 1.453125, + "learning_rate": 1.736030219341651e-05, + "long_answer_loss": 0.1555, + "loss": 0.1428, + "short_answer_loss": NaN, + "step": 1025, + "template_loss": 0.0 + }, + { + "epoch": 0.78, + "full_loss": 0.1272, + "grad_norm": 1.3984375, + "learning_rate": 1.7346037596864322e-05, + "long_answer_loss": 0.1272, + "loss": 0.1353, + "short_answer_loss": NaN, + "step": 1026, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1666, + "grad_norm": 1.4609375, + "learning_rate": 1.733176556934168e-05, + "long_answer_loss": 0.1666, + "loss": 0.1495, + "short_answer_loss": NaN, + "step": 1027, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1335, + "grad_norm": 1.46875, + "learning_rate": 1.731748613273347e-05, + "long_answer_loss": 0.1335, + "loss": 0.1372, + "short_answer_loss": NaN, + "step": 1028, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1394, + "grad_norm": 1.4609375, + "learning_rate": 1.7303199308935956e-05, + "long_answer_loss": 0.1394, + "loss": 0.1408, + "short_answer_loss": NaN, + "step": 1029, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1159, + "grad_norm": 1.5078125, + "learning_rate": 1.7288905119856717e-05, + "long_answer_loss": 0.1159, + "loss": 0.1358, + "short_answer_loss": NaN, + "step": 1030, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.123, + "grad_norm": 1.4453125, + "learning_rate": 1.7274603587414622e-05, + "long_answer_loss": 0.123, + "loss": 0.1401, + "short_answer_loss": NaN, + "step": 1031, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1443, + "grad_norm": 1.5625, + "learning_rate": 1.726029473353982e-05, + "long_answer_loss": 0.1443, + "loss": 0.1413, + "short_answer_loss": NaN, + "step": 1032, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1265, + "grad_norm": 1.3515625, + "learning_rate": 1.724597858017366e-05, + "long_answer_loss": 0.1265, + "loss": 0.1499, + "short_answer_loss": NaN, + "step": 1033, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1579, + "grad_norm": 1.4375, + "learning_rate": 1.723165514926871e-05, + "long_answer_loss": 0.1579, + "loss": 0.1498, + "short_answer_loss": NaN, + "step": 1034, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.115, + "grad_norm": 1.421875, + "learning_rate": 1.7217324462788676e-05, + "long_answer_loss": 0.115, + "loss": 0.1454, + "short_answer_loss": NaN, + "step": 1035, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1249, + "grad_norm": 1.3984375, + "learning_rate": 1.720298654270841e-05, + "long_answer_loss": 0.1249, + "loss": 0.1408, + "short_answer_loss": NaN, + "step": 1036, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1493, + "grad_norm": 1.375, + "learning_rate": 1.7188641411013833e-05, + "long_answer_loss": 0.1493, + "loss": 0.1411, + "short_answer_loss": NaN, + "step": 1037, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1663, + "grad_norm": 1.375, + "learning_rate": 1.7174289089701944e-05, + "long_answer_loss": 0.1663, + "loss": 0.1463, + "short_answer_loss": NaN, + "step": 1038, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1366, + "grad_norm": 1.3984375, + "learning_rate": 1.7159929600780765e-05, + "long_answer_loss": 0.1366, + "loss": 0.1406, + "short_answer_loss": NaN, + "step": 1039, + "template_loss": 0.0 + }, + { + "epoch": 0.79, + "full_loss": 0.1414, + "grad_norm": 1.4375, + "learning_rate": 1.7145562966269294e-05, + "long_answer_loss": 0.1414, + "loss": 0.1497, + "short_answer_loss": NaN, + "step": 1040, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1268, + "grad_norm": 1.359375, + "learning_rate": 1.71311892081975e-05, + "long_answer_loss": 0.1268, + "loss": 0.1374, + "short_answer_loss": NaN, + "step": 1041, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1364, + "grad_norm": 1.4921875, + "learning_rate": 1.7116808348606266e-05, + "long_answer_loss": 0.1364, + "loss": 0.143, + "short_answer_loss": NaN, + "step": 1042, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.13, + "grad_norm": 1.484375, + "learning_rate": 1.7102420409547374e-05, + "long_answer_loss": 0.13, + "loss": 0.1385, + "short_answer_loss": NaN, + "step": 1043, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1638, + "grad_norm": 1.4375, + "learning_rate": 1.7088025413083462e-05, + "long_answer_loss": 0.1638, + "loss": 0.1432, + "short_answer_loss": NaN, + "step": 1044, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1319, + "grad_norm": 1.40625, + "learning_rate": 1.7073623381287976e-05, + "long_answer_loss": 0.1319, + "loss": 0.1352, + "short_answer_loss": NaN, + "step": 1045, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1621, + "grad_norm": 1.4765625, + "learning_rate": 1.7059214336245164e-05, + "long_answer_loss": 0.1621, + "loss": 0.1478, + "short_answer_loss": NaN, + "step": 1046, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1217, + "grad_norm": 1.46875, + "learning_rate": 1.7044798300050025e-05, + "long_answer_loss": 0.1217, + "loss": 0.1409, + "short_answer_loss": NaN, + "step": 1047, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1424, + "grad_norm": 1.5078125, + "learning_rate": 1.703037529480827e-05, + "long_answer_loss": 0.1424, + "loss": 0.137, + "short_answer_loss": NaN, + "step": 1048, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1401, + "grad_norm": 1.4296875, + "learning_rate": 1.7015945342636307e-05, + "long_answer_loss": 0.1401, + "loss": 0.1443, + "short_answer_loss": NaN, + "step": 1049, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1351, + "grad_norm": 1.4921875, + "learning_rate": 1.70015084656612e-05, + "long_answer_loss": 0.1351, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 1050, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1336, + "grad_norm": 1.375, + "learning_rate": 1.698706468602061e-05, + "long_answer_loss": 0.1336, + "loss": 0.1328, + "short_answer_loss": NaN, + "step": 1051, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1307, + "grad_norm": 1.3671875, + "learning_rate": 1.6972614025862805e-05, + "long_answer_loss": 0.1307, + "loss": 0.1378, + "short_answer_loss": NaN, + "step": 1052, + "template_loss": 0.0 + }, + { + "epoch": 0.8, + "full_loss": 0.1622, + "grad_norm": 1.46875, + "learning_rate": 1.6958156507346592e-05, + "long_answer_loss": 0.1622, + "loss": 0.1553, + "short_answer_loss": NaN, + "step": 1053, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.144, + "grad_norm": 1.453125, + "learning_rate": 1.6943692152641303e-05, + "long_answer_loss": 0.144, + "loss": 0.1455, + "short_answer_loss": NaN, + "step": 1054, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1307, + "grad_norm": 1.4140625, + "learning_rate": 1.6929220983926748e-05, + "long_answer_loss": 0.1307, + "loss": 0.1342, + "short_answer_loss": NaN, + "step": 1055, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1492, + "grad_norm": 1.3828125, + "learning_rate": 1.691474302339318e-05, + "long_answer_loss": 0.1492, + "loss": 0.1402, + "short_answer_loss": NaN, + "step": 1056, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1517, + "grad_norm": 1.34375, + "learning_rate": 1.690025829324127e-05, + "long_answer_loss": 0.1517, + "loss": 0.1461, + "short_answer_loss": NaN, + "step": 1057, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1439, + "grad_norm": 1.359375, + "learning_rate": 1.6885766815682087e-05, + "long_answer_loss": 0.1439, + "loss": 0.1465, + "short_answer_loss": NaN, + "step": 1058, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1183, + "grad_norm": 1.4140625, + "learning_rate": 1.6871268612937013e-05, + "long_answer_loss": 0.1183, + "loss": 0.1372, + "short_answer_loss": NaN, + "step": 1059, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1304, + "grad_norm": 1.3671875, + "learning_rate": 1.6856763707237776e-05, + "long_answer_loss": 0.1304, + "loss": 0.141, + "short_answer_loss": NaN, + "step": 1060, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1255, + "grad_norm": 1.5234375, + "learning_rate": 1.6842252120826358e-05, + "long_answer_loss": 0.1255, + "loss": 0.1449, + "short_answer_loss": NaN, + "step": 1061, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1385, + "grad_norm": 1.5234375, + "learning_rate": 1.6827733875954994e-05, + "long_answer_loss": 0.1385, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 1062, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1537, + "grad_norm": 1.359375, + "learning_rate": 1.6813208994886135e-05, + "long_answer_loss": 0.1537, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 1063, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1223, + "grad_norm": 1.4375, + "learning_rate": 1.6798677499892397e-05, + "long_answer_loss": 0.1223, + "loss": 0.1493, + "short_answer_loss": NaN, + "step": 1064, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1459, + "grad_norm": 1.4609375, + "learning_rate": 1.678413941325655e-05, + "long_answer_loss": 0.1459, + "loss": 0.1456, + "short_answer_loss": NaN, + "step": 1065, + "template_loss": 0.0 + }, + { + "epoch": 0.81, + "full_loss": 0.1361, + "grad_norm": 1.46875, + "learning_rate": 1.6769594757271463e-05, + "long_answer_loss": 0.1361, + "loss": 0.1458, + "short_answer_loss": NaN, + "step": 1066, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1082, + "grad_norm": 1.46875, + "learning_rate": 1.6755043554240077e-05, + "long_answer_loss": 0.1082, + "loss": 0.1447, + "short_answer_loss": NaN, + "step": 1067, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1744, + "grad_norm": 1.4140625, + "learning_rate": 1.674048582647538e-05, + "long_answer_loss": 0.1744, + "loss": 0.1476, + "short_answer_loss": NaN, + "step": 1068, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1397, + "grad_norm": 1.6015625, + "learning_rate": 1.672592159630036e-05, + "long_answer_loss": 0.1397, + "loss": 0.1443, + "short_answer_loss": NaN, + "step": 1069, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1424, + "grad_norm": 1.53125, + "learning_rate": 1.6711350886047977e-05, + "long_answer_loss": 0.1424, + "loss": 0.1371, + "short_answer_loss": NaN, + "step": 1070, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1403, + "grad_norm": 1.3203125, + "learning_rate": 1.6696773718061128e-05, + "long_answer_loss": 0.1403, + "loss": 0.1335, + "short_answer_loss": NaN, + "step": 1071, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1599, + "grad_norm": 1.4609375, + "learning_rate": 1.6682190114692615e-05, + "long_answer_loss": 0.1599, + "loss": 0.1405, + "short_answer_loss": NaN, + "step": 1072, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.127, + "grad_norm": 1.3984375, + "learning_rate": 1.66676000983051e-05, + "long_answer_loss": 0.127, + "loss": 0.1411, + "short_answer_loss": NaN, + "step": 1073, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1255, + "grad_norm": 1.34375, + "learning_rate": 1.665300369127108e-05, + "long_answer_loss": 0.1255, + "loss": 0.1324, + "short_answer_loss": NaN, + "step": 1074, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1474, + "grad_norm": 1.4375, + "learning_rate": 1.6638400915972867e-05, + "long_answer_loss": 0.1474, + "loss": 0.1442, + "short_answer_loss": NaN, + "step": 1075, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1499, + "grad_norm": 1.4140625, + "learning_rate": 1.6623791794802518e-05, + "long_answer_loss": 0.1499, + "loss": 0.1438, + "short_answer_loss": NaN, + "step": 1076, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1433, + "grad_norm": 1.390625, + "learning_rate": 1.6609176350161836e-05, + "long_answer_loss": 0.1433, + "loss": 0.1397, + "short_answer_loss": NaN, + "step": 1077, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.1669, + "grad_norm": 1.4609375, + "learning_rate": 1.659455460446231e-05, + "long_answer_loss": 0.1669, + "loss": 0.1459, + "short_answer_loss": NaN, + "step": 1078, + "template_loss": 0.0 + }, + { + "epoch": 0.82, + "full_loss": 0.135, + "grad_norm": 1.4453125, + "learning_rate": 1.6579926580125095e-05, + "long_answer_loss": 0.135, + "loss": 0.1417, + "short_answer_loss": NaN, + "step": 1079, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1372, + "grad_norm": 1.4375, + "learning_rate": 1.656529229958097e-05, + "long_answer_loss": 0.1372, + "loss": 0.1428, + "short_answer_loss": NaN, + "step": 1080, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1415, + "grad_norm": 1.3828125, + "learning_rate": 1.6550651785270323e-05, + "long_answer_loss": 0.1415, + "loss": 0.1378, + "short_answer_loss": NaN, + "step": 1081, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1349, + "grad_norm": 1.4765625, + "learning_rate": 1.653600505964308e-05, + "long_answer_loss": 0.1349, + "loss": 0.1386, + "short_answer_loss": NaN, + "step": 1082, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1367, + "grad_norm": 1.4921875, + "learning_rate": 1.65213521451587e-05, + "long_answer_loss": 0.1367, + "loss": 0.1478, + "short_answer_loss": NaN, + "step": 1083, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1538, + "grad_norm": 1.5, + "learning_rate": 1.650669306428613e-05, + "long_answer_loss": 0.1538, + "loss": 0.1439, + "short_answer_loss": NaN, + "step": 1084, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1326, + "grad_norm": 1.3984375, + "learning_rate": 1.6492027839503788e-05, + "long_answer_loss": 0.1326, + "loss": 0.1413, + "short_answer_loss": NaN, + "step": 1085, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1394, + "grad_norm": 1.421875, + "learning_rate": 1.647735649329949e-05, + "long_answer_loss": 0.1394, + "loss": 0.1388, + "short_answer_loss": NaN, + "step": 1086, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1488, + "grad_norm": 1.390625, + "learning_rate": 1.646267904817045e-05, + "long_answer_loss": 0.1488, + "loss": 0.1431, + "short_answer_loss": NaN, + "step": 1087, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.137, + "grad_norm": 1.4453125, + "learning_rate": 1.644799552662323e-05, + "long_answer_loss": 0.137, + "loss": 0.1355, + "short_answer_loss": NaN, + "step": 1088, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1189, + "grad_norm": 1.359375, + "learning_rate": 1.643330595117372e-05, + "long_answer_loss": 0.1189, + "loss": 0.1385, + "short_answer_loss": NaN, + "step": 1089, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1385, + "grad_norm": 1.421875, + "learning_rate": 1.6418610344347085e-05, + "long_answer_loss": 0.1385, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 1090, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.121, + "grad_norm": 1.46875, + "learning_rate": 1.640390872867774e-05, + "long_answer_loss": 0.121, + "loss": 0.1361, + "short_answer_loss": NaN, + "step": 1091, + "template_loss": 0.0 + }, + { + "epoch": 0.83, + "full_loss": 0.1373, + "grad_norm": 1.390625, + "learning_rate": 1.6389201126709307e-05, + "long_answer_loss": 0.1373, + "loss": 0.135, + "short_answer_loss": NaN, + "step": 1092, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.132, + "grad_norm": 1.421875, + "learning_rate": 1.63744875609946e-05, + "long_answer_loss": 0.132, + "loss": 0.1382, + "short_answer_loss": NaN, + "step": 1093, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1161, + "grad_norm": 1.375, + "learning_rate": 1.6359768054095574e-05, + "long_answer_loss": 0.1161, + "loss": 0.1361, + "short_answer_loss": NaN, + "step": 1094, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1453, + "grad_norm": 1.375, + "learning_rate": 1.6345042628583284e-05, + "long_answer_loss": 0.1453, + "loss": 0.1388, + "short_answer_loss": NaN, + "step": 1095, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1506, + "grad_norm": 1.4375, + "learning_rate": 1.6330311307037875e-05, + "long_answer_loss": 0.1506, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1096, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1316, + "grad_norm": 1.4375, + "learning_rate": 1.6315574112048523e-05, + "long_answer_loss": 0.1316, + "loss": 0.1449, + "short_answer_loss": NaN, + "step": 1097, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1235, + "grad_norm": 1.375, + "learning_rate": 1.630083106621342e-05, + "long_answer_loss": 0.1235, + "loss": 0.1381, + "short_answer_loss": NaN, + "step": 1098, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1347, + "grad_norm": 1.453125, + "learning_rate": 1.628608219213972e-05, + "long_answer_loss": 0.1347, + "loss": 0.1365, + "short_answer_loss": NaN, + "step": 1099, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1537, + "grad_norm": 1.453125, + "learning_rate": 1.6271327512443517e-05, + "long_answer_loss": 0.1537, + "loss": 0.1459, + "short_answer_loss": NaN, + "step": 1100, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1464, + "grad_norm": 1.390625, + "learning_rate": 1.6256567049749815e-05, + "long_answer_loss": 0.1464, + "loss": 0.1397, + "short_answer_loss": NaN, + "step": 1101, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1543, + "grad_norm": 1.421875, + "learning_rate": 1.6241800826692472e-05, + "long_answer_loss": 0.1543, + "loss": 0.1424, + "short_answer_loss": NaN, + "step": 1102, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1348, + "grad_norm": 1.5546875, + "learning_rate": 1.6227028865914188e-05, + "long_answer_loss": 0.1348, + "loss": 0.1382, + "short_answer_loss": NaN, + "step": 1103, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1496, + "grad_norm": 1.4375, + "learning_rate": 1.621225119006646e-05, + "long_answer_loss": 0.1496, + "loss": 0.146, + "short_answer_loss": NaN, + "step": 1104, + "template_loss": 0.0 + }, + { + "epoch": 0.84, + "full_loss": 0.1386, + "grad_norm": 1.421875, + "learning_rate": 1.619746782180955e-05, + "long_answer_loss": 0.1386, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1105, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1136, + "grad_norm": 1.453125, + "learning_rate": 1.6182678783812444e-05, + "long_answer_loss": 0.1136, + "loss": 0.1429, + "short_answer_loss": NaN, + "step": 1106, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1458, + "grad_norm": 1.3359375, + "learning_rate": 1.6167884098752835e-05, + "long_answer_loss": 0.1458, + "loss": 0.1399, + "short_answer_loss": NaN, + "step": 1107, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1425, + "grad_norm": 1.4609375, + "learning_rate": 1.6153083789317047e-05, + "long_answer_loss": 0.1425, + "loss": 0.1443, + "short_answer_loss": NaN, + "step": 1108, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1617, + "grad_norm": 1.375, + "learning_rate": 1.613827787820006e-05, + "long_answer_loss": 0.1617, + "loss": 0.1401, + "short_answer_loss": NaN, + "step": 1109, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1546, + "grad_norm": 1.578125, + "learning_rate": 1.612346638810543e-05, + "long_answer_loss": 0.1546, + "loss": 0.1324, + "short_answer_loss": NaN, + "step": 1110, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1539, + "grad_norm": 1.421875, + "learning_rate": 1.6108649341745262e-05, + "long_answer_loss": 0.1539, + "loss": 0.1411, + "short_answer_loss": NaN, + "step": 1111, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1427, + "grad_norm": 1.4609375, + "learning_rate": 1.6093826761840196e-05, + "long_answer_loss": 0.1427, + "loss": 0.1475, + "short_answer_loss": NaN, + "step": 1112, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.148, + "grad_norm": 1.453125, + "learning_rate": 1.607899867111934e-05, + "long_answer_loss": 0.148, + "loss": 0.1361, + "short_answer_loss": NaN, + "step": 1113, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1191, + "grad_norm": 1.359375, + "learning_rate": 1.6064165092320264e-05, + "long_answer_loss": 0.1191, + "loss": 0.1328, + "short_answer_loss": NaN, + "step": 1114, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1792, + "grad_norm": 1.46875, + "learning_rate": 1.6049326048188955e-05, + "long_answer_loss": 0.1792, + "loss": 0.144, + "short_answer_loss": NaN, + "step": 1115, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1313, + "grad_norm": 1.453125, + "learning_rate": 1.6034481561479765e-05, + "long_answer_loss": 0.1313, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1116, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1359, + "grad_norm": 1.4609375, + "learning_rate": 1.6019631654955412e-05, + "long_answer_loss": 0.1359, + "loss": 0.144, + "short_answer_loss": NaN, + "step": 1117, + "template_loss": 0.0 + }, + { + "epoch": 0.85, + "full_loss": 0.1277, + "grad_norm": 1.3671875, + "learning_rate": 1.6004776351386913e-05, + "long_answer_loss": 0.1277, + "loss": 0.135, + "short_answer_loss": NaN, + "step": 1118, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1505, + "grad_norm": 1.40625, + "learning_rate": 1.5989915673553564e-05, + "long_answer_loss": 0.1505, + "loss": 0.1391, + "short_answer_loss": NaN, + "step": 1119, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1302, + "grad_norm": 1.453125, + "learning_rate": 1.59750496442429e-05, + "long_answer_loss": 0.1302, + "loss": 0.1447, + "short_answer_loss": NaN, + "step": 1120, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1207, + "grad_norm": 1.3359375, + "learning_rate": 1.5960178286250668e-05, + "long_answer_loss": 0.1207, + "loss": 0.1341, + "short_answer_loss": NaN, + "step": 1121, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1697, + "grad_norm": 1.4375, + "learning_rate": 1.5945301622380772e-05, + "long_answer_loss": 0.1697, + "loss": 0.1441, + "short_answer_loss": NaN, + "step": 1122, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1466, + "grad_norm": 1.359375, + "learning_rate": 1.5930419675445273e-05, + "long_answer_loss": 0.1466, + "loss": 0.1319, + "short_answer_loss": NaN, + "step": 1123, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1492, + "grad_norm": 1.5078125, + "learning_rate": 1.5915532468264314e-05, + "long_answer_loss": 0.1492, + "loss": 0.1359, + "short_answer_loss": NaN, + "step": 1124, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1341, + "grad_norm": 1.3984375, + "learning_rate": 1.5900640023666108e-05, + "long_answer_loss": 0.1341, + "loss": 0.1454, + "short_answer_loss": NaN, + "step": 1125, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1246, + "grad_norm": 1.375, + "learning_rate": 1.5885742364486915e-05, + "long_answer_loss": 0.1246, + "loss": 0.1365, + "short_answer_loss": NaN, + "step": 1126, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1449, + "grad_norm": 1.4453125, + "learning_rate": 1.5870839513570967e-05, + "long_answer_loss": 0.1449, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1127, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.149, + "grad_norm": 1.4296875, + "learning_rate": 1.5855931493770477e-05, + "long_answer_loss": 0.149, + "loss": 0.1423, + "short_answer_loss": NaN, + "step": 1128, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1357, + "grad_norm": 1.4453125, + "learning_rate": 1.5841018327945576e-05, + "long_answer_loss": 0.1357, + "loss": 0.145, + "short_answer_loss": NaN, + "step": 1129, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1314, + "grad_norm": 1.4609375, + "learning_rate": 1.5826100038964282e-05, + "long_answer_loss": 0.1314, + "loss": 0.1373, + "short_answer_loss": NaN, + "step": 1130, + "template_loss": 0.0 + }, + { + "epoch": 0.86, + "full_loss": 0.1553, + "grad_norm": 1.390625, + "learning_rate": 1.581117664970247e-05, + "long_answer_loss": 0.1553, + "loss": 0.1332, + "short_answer_loss": NaN, + "step": 1131, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1194, + "grad_norm": 1.46875, + "learning_rate": 1.5796248183043848e-05, + "long_answer_loss": 0.1194, + "loss": 0.1375, + "short_answer_loss": NaN, + "step": 1132, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1377, + "grad_norm": 1.4140625, + "learning_rate": 1.5781314661879896e-05, + "long_answer_loss": 0.1377, + "loss": 0.1301, + "short_answer_loss": NaN, + "step": 1133, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.153, + "grad_norm": 1.515625, + "learning_rate": 1.5766376109109847e-05, + "long_answer_loss": 0.153, + "loss": 0.1465, + "short_answer_loss": NaN, + "step": 1134, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1211, + "grad_norm": 1.4765625, + "learning_rate": 1.5751432547640655e-05, + "long_answer_loss": 0.1211, + "loss": 0.1371, + "short_answer_loss": NaN, + "step": 1135, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1608, + "grad_norm": 1.453125, + "learning_rate": 1.573648400038695e-05, + "long_answer_loss": 0.1608, + "loss": 0.1442, + "short_answer_loss": NaN, + "step": 1136, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1268, + "grad_norm": 1.40625, + "learning_rate": 1.572153049027101e-05, + "long_answer_loss": 0.1268, + "loss": 0.1373, + "short_answer_loss": NaN, + "step": 1137, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1505, + "grad_norm": 1.4609375, + "learning_rate": 1.5706572040222715e-05, + "long_answer_loss": 0.1505, + "loss": 0.1437, + "short_answer_loss": NaN, + "step": 1138, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1376, + "grad_norm": 1.484375, + "learning_rate": 1.5691608673179532e-05, + "long_answer_loss": 0.1376, + "loss": 0.1398, + "short_answer_loss": NaN, + "step": 1139, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1378, + "grad_norm": 1.46875, + "learning_rate": 1.5676640412086463e-05, + "long_answer_loss": 0.1378, + "loss": 0.1319, + "short_answer_loss": NaN, + "step": 1140, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1355, + "grad_norm": 1.34375, + "learning_rate": 1.566166727989601e-05, + "long_answer_loss": 0.1355, + "loss": 0.129, + "short_answer_loss": NaN, + "step": 1141, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1417, + "grad_norm": 1.4765625, + "learning_rate": 1.564668929956815e-05, + "long_answer_loss": 0.1417, + "loss": 0.1411, + "short_answer_loss": NaN, + "step": 1142, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1259, + "grad_norm": 1.453125, + "learning_rate": 1.5631706494070298e-05, + "long_answer_loss": 0.1259, + "loss": 0.1447, + "short_answer_loss": NaN, + "step": 1143, + "template_loss": 0.0 + }, + { + "epoch": 0.87, + "full_loss": 0.1341, + "grad_norm": 1.390625, + "learning_rate": 1.5616718886377253e-05, + "long_answer_loss": 0.1341, + "loss": 0.1326, + "short_answer_loss": NaN, + "step": 1144, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1329, + "grad_norm": 1.4296875, + "learning_rate": 1.5601726499471193e-05, + "long_answer_loss": 0.1329, + "loss": 0.131, + "short_answer_loss": NaN, + "step": 1145, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.118, + "grad_norm": 1.359375, + "learning_rate": 1.558672935634161e-05, + "long_answer_loss": 0.118, + "loss": 0.1346, + "short_answer_loss": NaN, + "step": 1146, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1564, + "grad_norm": 1.390625, + "learning_rate": 1.557172747998531e-05, + "long_answer_loss": 0.1564, + "loss": 0.1375, + "short_answer_loss": NaN, + "step": 1147, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1356, + "grad_norm": 1.5078125, + "learning_rate": 1.555672089340634e-05, + "long_answer_loss": 0.1356, + "loss": 0.1357, + "short_answer_loss": NaN, + "step": 1148, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.137, + "grad_norm": 1.4453125, + "learning_rate": 1.554170961961597e-05, + "long_answer_loss": 0.137, + "loss": 0.1416, + "short_answer_loss": NaN, + "step": 1149, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1249, + "grad_norm": 1.46875, + "learning_rate": 1.5526693681632664e-05, + "long_answer_loss": 0.1249, + "loss": 0.1455, + "short_answer_loss": NaN, + "step": 1150, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1419, + "grad_norm": 1.4453125, + "learning_rate": 1.5511673102482044e-05, + "long_answer_loss": 0.1419, + "loss": 0.1373, + "short_answer_loss": NaN, + "step": 1151, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1412, + "grad_norm": 1.6328125, + "learning_rate": 1.549664790519683e-05, + "long_answer_loss": 0.1412, + "loss": 0.1469, + "short_answer_loss": NaN, + "step": 1152, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1286, + "grad_norm": 1.4140625, + "learning_rate": 1.5481618112816844e-05, + "long_answer_loss": 0.1286, + "loss": 0.136, + "short_answer_loss": NaN, + "step": 1153, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1332, + "grad_norm": 1.40625, + "learning_rate": 1.546658374838894e-05, + "long_answer_loss": 0.1332, + "loss": 0.1317, + "short_answer_loss": NaN, + "step": 1154, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1415, + "grad_norm": 1.4140625, + "learning_rate": 1.545154483496698e-05, + "long_answer_loss": 0.1415, + "loss": 0.1436, + "short_answer_loss": NaN, + "step": 1155, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1331, + "grad_norm": 1.515625, + "learning_rate": 1.543650139561182e-05, + "long_answer_loss": 0.1331, + "loss": 0.1448, + "short_answer_loss": NaN, + "step": 1156, + "template_loss": 0.0 + }, + { + "epoch": 0.88, + "full_loss": 0.1594, + "grad_norm": 1.453125, + "learning_rate": 1.542145345339124e-05, + "long_answer_loss": 0.1594, + "loss": 0.1387, + "short_answer_loss": NaN, + "step": 1157, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1549, + "grad_norm": 1.40625, + "learning_rate": 1.540640103137993e-05, + "long_answer_loss": 0.1549, + "loss": 0.1451, + "short_answer_loss": NaN, + "step": 1158, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1575, + "grad_norm": 1.390625, + "learning_rate": 1.539134415265945e-05, + "long_answer_loss": 0.1575, + "loss": 0.142, + "short_answer_loss": NaN, + "step": 1159, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1338, + "grad_norm": 1.4609375, + "learning_rate": 1.5376282840318196e-05, + "long_answer_loss": 0.1338, + "loss": 0.1372, + "short_answer_loss": NaN, + "step": 1160, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1499, + "grad_norm": 1.375, + "learning_rate": 1.5361217117451355e-05, + "long_answer_loss": 0.1499, + "loss": 0.1331, + "short_answer_loss": NaN, + "step": 1161, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1488, + "grad_norm": 1.5078125, + "learning_rate": 1.534614700716088e-05, + "long_answer_loss": 0.1488, + "loss": 0.1432, + "short_answer_loss": NaN, + "step": 1162, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.172, + "grad_norm": 1.453125, + "learning_rate": 1.5331072532555462e-05, + "long_answer_loss": 0.172, + "loss": 0.1418, + "short_answer_loss": NaN, + "step": 1163, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1356, + "grad_norm": 1.40625, + "learning_rate": 1.5315993716750472e-05, + "long_answer_loss": 0.1356, + "loss": 0.1302, + "short_answer_loss": NaN, + "step": 1164, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1372, + "grad_norm": 1.4140625, + "learning_rate": 1.5300910582867933e-05, + "long_answer_loss": 0.1372, + "loss": 0.1361, + "short_answer_loss": NaN, + "step": 1165, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1568, + "grad_norm": 1.46875, + "learning_rate": 1.528582315403651e-05, + "long_answer_loss": 0.1568, + "loss": 0.1498, + "short_answer_loss": NaN, + "step": 1166, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1185, + "grad_norm": 1.4453125, + "learning_rate": 1.527073145339144e-05, + "long_answer_loss": 0.1185, + "loss": 0.1337, + "short_answer_loss": NaN, + "step": 1167, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1493, + "grad_norm": 1.4765625, + "learning_rate": 1.5255635504074503e-05, + "long_answer_loss": 0.1493, + "loss": 0.1374, + "short_answer_loss": NaN, + "step": 1168, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1183, + "grad_norm": 1.4375, + "learning_rate": 1.5240535329234012e-05, + "long_answer_loss": 0.1183, + "loss": 0.1348, + "short_answer_loss": NaN, + "step": 1169, + "template_loss": 0.0 + }, + { + "epoch": 0.89, + "full_loss": 0.1362, + "grad_norm": 1.3203125, + "learning_rate": 1.522543095202475e-05, + "long_answer_loss": 0.1362, + "loss": 0.1335, + "short_answer_loss": NaN, + "step": 1170, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1614, + "grad_norm": 1.5, + "learning_rate": 1.5210322395607945e-05, + "long_answer_loss": 0.1614, + "loss": 0.1402, + "short_answer_loss": NaN, + "step": 1171, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1417, + "grad_norm": 1.515625, + "learning_rate": 1.519520968315123e-05, + "long_answer_loss": 0.1417, + "loss": 0.1339, + "short_answer_loss": NaN, + "step": 1172, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1462, + "grad_norm": 1.3984375, + "learning_rate": 1.5180092837828618e-05, + "long_answer_loss": 0.1462, + "loss": 0.1358, + "short_answer_loss": NaN, + "step": 1173, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1245, + "grad_norm": 1.5390625, + "learning_rate": 1.5164971882820456e-05, + "long_answer_loss": 0.1245, + "loss": 0.1403, + "short_answer_loss": NaN, + "step": 1174, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1299, + "grad_norm": 1.3671875, + "learning_rate": 1.5149846841313389e-05, + "long_answer_loss": 0.1299, + "loss": 0.1352, + "short_answer_loss": NaN, + "step": 1175, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.152, + "grad_norm": 1.375, + "learning_rate": 1.513471773650033e-05, + "long_answer_loss": 0.152, + "loss": 0.1356, + "short_answer_loss": NaN, + "step": 1176, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1594, + "grad_norm": 1.484375, + "learning_rate": 1.5119584591580429e-05, + "long_answer_loss": 0.1594, + "loss": 0.148, + "short_answer_loss": NaN, + "step": 1177, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1353, + "grad_norm": 1.390625, + "learning_rate": 1.5104447429759024e-05, + "long_answer_loss": 0.1353, + "loss": 0.1365, + "short_answer_loss": NaN, + "step": 1178, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1544, + "grad_norm": 1.390625, + "learning_rate": 1.5089306274247616e-05, + "long_answer_loss": 0.1544, + "loss": 0.1402, + "short_answer_loss": NaN, + "step": 1179, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1402, + "grad_norm": 1.3984375, + "learning_rate": 1.507416114826383e-05, + "long_answer_loss": 0.1402, + "loss": 0.1405, + "short_answer_loss": NaN, + "step": 1180, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1442, + "grad_norm": 1.4140625, + "learning_rate": 1.5059012075031378e-05, + "long_answer_loss": 0.1442, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1181, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1276, + "grad_norm": 1.453125, + "learning_rate": 1.5043859077780026e-05, + "long_answer_loss": 0.1276, + "loss": 0.1354, + "short_answer_loss": NaN, + "step": 1182, + "template_loss": 0.0 + }, + { + "epoch": 0.9, + "full_loss": 0.1279, + "grad_norm": 1.390625, + "learning_rate": 1.5028702179745554e-05, + "long_answer_loss": 0.1279, + "loss": 0.1331, + "short_answer_loss": NaN, + "step": 1183, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1547, + "grad_norm": 1.3515625, + "learning_rate": 1.501354140416973e-05, + "long_answer_loss": 0.1547, + "loss": 0.1367, + "short_answer_loss": NaN, + "step": 1184, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1252, + "grad_norm": 1.3828125, + "learning_rate": 1.4998376774300257e-05, + "long_answer_loss": 0.1252, + "loss": 0.1347, + "short_answer_loss": NaN, + "step": 1185, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1316, + "grad_norm": 1.484375, + "learning_rate": 1.498320831339076e-05, + "long_answer_loss": 0.1316, + "loss": 0.1353, + "short_answer_loss": NaN, + "step": 1186, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1248, + "grad_norm": 1.390625, + "learning_rate": 1.4968036044700729e-05, + "long_answer_loss": 0.1248, + "loss": 0.138, + "short_answer_loss": NaN, + "step": 1187, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.164, + "grad_norm": 1.453125, + "learning_rate": 1.4952859991495504e-05, + "long_answer_loss": 0.164, + "loss": 0.1331, + "short_answer_loss": NaN, + "step": 1188, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1244, + "grad_norm": 1.3984375, + "learning_rate": 1.4937680177046218e-05, + "long_answer_loss": 0.1244, + "loss": 0.1313, + "short_answer_loss": NaN, + "step": 1189, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1033, + "grad_norm": 1.4375, + "learning_rate": 1.4922496624629775e-05, + "long_answer_loss": 0.1033, + "loss": 0.1357, + "short_answer_loss": NaN, + "step": 1190, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1276, + "grad_norm": 1.515625, + "learning_rate": 1.4907309357528812e-05, + "long_answer_loss": 0.1276, + "loss": 0.1375, + "short_answer_loss": NaN, + "step": 1191, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1301, + "grad_norm": 1.59375, + "learning_rate": 1.489211839903166e-05, + "long_answer_loss": 0.1301, + "loss": 0.1302, + "short_answer_loss": NaN, + "step": 1192, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1377, + "grad_norm": 1.484375, + "learning_rate": 1.487692377243231e-05, + "long_answer_loss": 0.1377, + "loss": 0.1379, + "short_answer_loss": NaN, + "step": 1193, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1347, + "grad_norm": 1.671875, + "learning_rate": 1.4861725501030389e-05, + "long_answer_loss": 0.1347, + "loss": 0.1357, + "short_answer_loss": NaN, + "step": 1194, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.144, + "grad_norm": 1.375, + "learning_rate": 1.4846523608131088e-05, + "long_answer_loss": 0.144, + "loss": 0.1278, + "short_answer_loss": NaN, + "step": 1195, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.125, + "grad_norm": 1.46875, + "learning_rate": 1.4831318117045177e-05, + "long_answer_loss": 0.125, + "loss": 0.1355, + "short_answer_loss": NaN, + "step": 1196, + "template_loss": 0.0 + }, + { + "epoch": 0.91, + "full_loss": 0.1339, + "grad_norm": 1.4375, + "learning_rate": 1.4816109051088931e-05, + "long_answer_loss": 0.1339, + "loss": 0.1412, + "short_answer_loss": NaN, + "step": 1197, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1622, + "grad_norm": 1.4921875, + "learning_rate": 1.4800896433584107e-05, + "long_answer_loss": 0.1622, + "loss": 0.1453, + "short_answer_loss": NaN, + "step": 1198, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1194, + "grad_norm": 1.3828125, + "learning_rate": 1.4785680287857911e-05, + "long_answer_loss": 0.1194, + "loss": 0.1314, + "short_answer_loss": NaN, + "step": 1199, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1294, + "grad_norm": 1.5234375, + "learning_rate": 1.4770460637242955e-05, + "long_answer_loss": 0.1294, + "loss": 0.133, + "short_answer_loss": NaN, + "step": 1200, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.13, + "grad_norm": 1.4296875, + "learning_rate": 1.4755237505077236e-05, + "long_answer_loss": 0.13, + "loss": 0.1367, + "short_answer_loss": NaN, + "step": 1201, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1223, + "grad_norm": 1.46875, + "learning_rate": 1.4740010914704071e-05, + "long_answer_loss": 0.1223, + "loss": 0.1364, + "short_answer_loss": NaN, + "step": 1202, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1293, + "grad_norm": 1.421875, + "learning_rate": 1.47247808894721e-05, + "long_answer_loss": 0.1293, + "loss": 0.1396, + "short_answer_loss": NaN, + "step": 1203, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1281, + "grad_norm": 1.375, + "learning_rate": 1.4709547452735223e-05, + "long_answer_loss": 0.1281, + "loss": 0.1289, + "short_answer_loss": NaN, + "step": 1204, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1258, + "grad_norm": 1.3984375, + "learning_rate": 1.4694310627852559e-05, + "long_answer_loss": 0.1258, + "loss": 0.1401, + "short_answer_loss": NaN, + "step": 1205, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1266, + "grad_norm": 1.5390625, + "learning_rate": 1.467907043818844e-05, + "long_answer_loss": 0.1266, + "loss": 0.1384, + "short_answer_loss": NaN, + "step": 1206, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1235, + "grad_norm": 1.40625, + "learning_rate": 1.4663826907112348e-05, + "long_answer_loss": 0.1235, + "loss": 0.1303, + "short_answer_loss": NaN, + "step": 1207, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1438, + "grad_norm": 1.4765625, + "learning_rate": 1.464858005799889e-05, + "long_answer_loss": 0.1438, + "loss": 0.1363, + "short_answer_loss": NaN, + "step": 1208, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1449, + "grad_norm": 1.6171875, + "learning_rate": 1.4633329914227761e-05, + "long_answer_loss": 0.1449, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1209, + "template_loss": 0.0 + }, + { + "epoch": 0.92, + "full_loss": 0.1329, + "grad_norm": 1.5859375, + "learning_rate": 1.4618076499183713e-05, + "long_answer_loss": 0.1329, + "loss": 0.1371, + "short_answer_loss": NaN, + "step": 1210, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1368, + "grad_norm": 1.5, + "learning_rate": 1.4602819836256507e-05, + "long_answer_loss": 0.1368, + "loss": 0.1397, + "short_answer_loss": NaN, + "step": 1211, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1345, + "grad_norm": 1.4765625, + "learning_rate": 1.4587559948840892e-05, + "long_answer_loss": 0.1345, + "loss": 0.1374, + "short_answer_loss": NaN, + "step": 1212, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1303, + "grad_norm": 1.3828125, + "learning_rate": 1.4572296860336552e-05, + "long_answer_loss": 0.1303, + "loss": 0.1303, + "short_answer_loss": NaN, + "step": 1213, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1356, + "grad_norm": 1.515625, + "learning_rate": 1.4557030594148086e-05, + "long_answer_loss": 0.1356, + "loss": 0.138, + "short_answer_loss": NaN, + "step": 1214, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1285, + "grad_norm": 1.484375, + "learning_rate": 1.4541761173684965e-05, + "long_answer_loss": 0.1285, + "loss": 0.1302, + "short_answer_loss": NaN, + "step": 1215, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1382, + "grad_norm": 1.515625, + "learning_rate": 1.4526488622361493e-05, + "long_answer_loss": 0.1382, + "loss": 0.1375, + "short_answer_loss": NaN, + "step": 1216, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1162, + "grad_norm": 1.390625, + "learning_rate": 1.4511212963596779e-05, + "long_answer_loss": 0.1162, + "loss": 0.1376, + "short_answer_loss": NaN, + "step": 1217, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1353, + "grad_norm": 1.4140625, + "learning_rate": 1.44959342208147e-05, + "long_answer_loss": 0.1353, + "loss": 0.1363, + "short_answer_loss": NaN, + "step": 1218, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.132, + "grad_norm": 1.4609375, + "learning_rate": 1.4480652417443854e-05, + "long_answer_loss": 0.132, + "loss": 0.1388, + "short_answer_loss": NaN, + "step": 1219, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1203, + "grad_norm": 1.484375, + "learning_rate": 1.446536757691754e-05, + "long_answer_loss": 0.1203, + "loss": 0.1386, + "short_answer_loss": NaN, + "step": 1220, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1492, + "grad_norm": 1.5, + "learning_rate": 1.4450079722673706e-05, + "long_answer_loss": 0.1492, + "loss": 0.1381, + "short_answer_loss": NaN, + "step": 1221, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1228, + "grad_norm": 1.4765625, + "learning_rate": 1.4434788878154928e-05, + "long_answer_loss": 0.1228, + "loss": 0.1335, + "short_answer_loss": NaN, + "step": 1222, + "template_loss": 0.0 + }, + { + "epoch": 0.93, + "full_loss": 0.1201, + "grad_norm": 1.4375, + "learning_rate": 1.4419495066808364e-05, + "long_answer_loss": 0.1201, + "loss": 0.1344, + "short_answer_loss": NaN, + "step": 1223, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1342, + "grad_norm": 1.53125, + "learning_rate": 1.4404198312085723e-05, + "long_answer_loss": 0.1342, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1224, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.13, + "grad_norm": 1.4140625, + "learning_rate": 1.438889863744323e-05, + "long_answer_loss": 0.13, + "loss": 0.1313, + "short_answer_loss": NaN, + "step": 1225, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1214, + "grad_norm": 1.4375, + "learning_rate": 1.4373596066341577e-05, + "long_answer_loss": 0.1214, + "loss": 0.1381, + "short_answer_loss": NaN, + "step": 1226, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1288, + "grad_norm": 1.5078125, + "learning_rate": 1.435829062224591e-05, + "long_answer_loss": 0.1288, + "loss": 0.1359, + "short_answer_loss": NaN, + "step": 1227, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1405, + "grad_norm": 1.5, + "learning_rate": 1.4342982328625774e-05, + "long_answer_loss": 0.1405, + "loss": 0.1336, + "short_answer_loss": NaN, + "step": 1228, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1062, + "grad_norm": 1.453125, + "learning_rate": 1.4327671208955082e-05, + "long_answer_loss": 0.1062, + "loss": 0.1299, + "short_answer_loss": NaN, + "step": 1229, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1304, + "grad_norm": 1.46875, + "learning_rate": 1.4312357286712085e-05, + "long_answer_loss": 0.1304, + "loss": 0.1324, + "short_answer_loss": NaN, + "step": 1230, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1071, + "grad_norm": 1.390625, + "learning_rate": 1.4297040585379332e-05, + "long_answer_loss": 0.1071, + "loss": 0.1306, + "short_answer_loss": NaN, + "step": 1231, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.129, + "grad_norm": 1.3515625, + "learning_rate": 1.4281721128443625e-05, + "long_answer_loss": 0.129, + "loss": 0.1285, + "short_answer_loss": NaN, + "step": 1232, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.135, + "grad_norm": 1.59375, + "learning_rate": 1.4266398939396006e-05, + "long_answer_loss": 0.135, + "loss": 0.1428, + "short_answer_loss": NaN, + "step": 1233, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1537, + "grad_norm": 1.4921875, + "learning_rate": 1.4251074041731694e-05, + "long_answer_loss": 0.1537, + "loss": 0.1393, + "short_answer_loss": NaN, + "step": 1234, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1304, + "grad_norm": 1.421875, + "learning_rate": 1.4235746458950061e-05, + "long_answer_loss": 0.1304, + "loss": 0.1339, + "short_answer_loss": NaN, + "step": 1235, + "template_loss": 0.0 + }, + { + "epoch": 0.94, + "full_loss": 0.1146, + "grad_norm": 1.40625, + "learning_rate": 1.422041621455461e-05, + "long_answer_loss": 0.1146, + "loss": 0.132, + "short_answer_loss": NaN, + "step": 1236, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1235, + "grad_norm": 1.4296875, + "learning_rate": 1.4205083332052906e-05, + "long_answer_loss": 0.1235, + "loss": 0.1316, + "short_answer_loss": NaN, + "step": 1237, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1267, + "grad_norm": 1.4296875, + "learning_rate": 1.4189747834956576e-05, + "long_answer_loss": 0.1267, + "loss": 0.1321, + "short_answer_loss": NaN, + "step": 1238, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1219, + "grad_norm": 1.484375, + "learning_rate": 1.4174409746781247e-05, + "long_answer_loss": 0.1219, + "loss": 0.1335, + "short_answer_loss": NaN, + "step": 1239, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1359, + "grad_norm": 1.5234375, + "learning_rate": 1.4159069091046526e-05, + "long_answer_loss": 0.1359, + "loss": 0.1457, + "short_answer_loss": NaN, + "step": 1240, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1002, + "grad_norm": 1.484375, + "learning_rate": 1.4143725891275946e-05, + "long_answer_loss": 0.1002, + "loss": 0.1367, + "short_answer_loss": NaN, + "step": 1241, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1268, + "grad_norm": 1.4140625, + "learning_rate": 1.412838017099696e-05, + "long_answer_loss": 0.1268, + "loss": 0.1263, + "short_answer_loss": NaN, + "step": 1242, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1261, + "grad_norm": 1.484375, + "learning_rate": 1.411303195374086e-05, + "long_answer_loss": 0.1261, + "loss": 0.1423, + "short_answer_loss": NaN, + "step": 1243, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1258, + "grad_norm": 1.4140625, + "learning_rate": 1.4097681263042789e-05, + "long_answer_loss": 0.1258, + "loss": 0.1346, + "short_answer_loss": NaN, + "step": 1244, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1439, + "grad_norm": 1.4453125, + "learning_rate": 1.4082328122441676e-05, + "long_answer_loss": 0.1439, + "loss": 0.1404, + "short_answer_loss": NaN, + "step": 1245, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1692, + "grad_norm": 1.484375, + "learning_rate": 1.4066972555480201e-05, + "long_answer_loss": 0.1692, + "loss": 0.1408, + "short_answer_loss": NaN, + "step": 1246, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1202, + "grad_norm": 1.3828125, + "learning_rate": 1.405161458570477e-05, + "long_answer_loss": 0.1202, + "loss": 0.1311, + "short_answer_loss": NaN, + "step": 1247, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1195, + "grad_norm": 1.421875, + "learning_rate": 1.4036254236665472e-05, + "long_answer_loss": 0.1195, + "loss": 0.137, + "short_answer_loss": NaN, + "step": 1248, + "template_loss": 0.0 + }, + { + "epoch": 0.95, + "full_loss": 0.1264, + "grad_norm": 1.4375, + "learning_rate": 1.4020891531916047e-05, + "long_answer_loss": 0.1264, + "loss": 0.1485, + "short_answer_loss": NaN, + "step": 1249, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1213, + "grad_norm": 1.3515625, + "learning_rate": 1.4005526495013848e-05, + "long_answer_loss": 0.1213, + "loss": 0.1246, + "short_answer_loss": NaN, + "step": 1250, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1144, + "grad_norm": 1.375, + "learning_rate": 1.3990159149519797e-05, + "long_answer_loss": 0.1144, + "loss": 0.1242, + "short_answer_loss": NaN, + "step": 1251, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1169, + "grad_norm": 1.40625, + "learning_rate": 1.397478951899836e-05, + "long_answer_loss": 0.1169, + "loss": 0.1403, + "short_answer_loss": NaN, + "step": 1252, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1254, + "grad_norm": 1.4375, + "learning_rate": 1.3959417627017507e-05, + "long_answer_loss": 0.1254, + "loss": 0.1304, + "short_answer_loss": NaN, + "step": 1253, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1331, + "grad_norm": 1.390625, + "learning_rate": 1.3944043497148682e-05, + "long_answer_loss": 0.1331, + "loss": 0.132, + "short_answer_loss": NaN, + "step": 1254, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1532, + "grad_norm": 1.453125, + "learning_rate": 1.3928667152966748e-05, + "long_answer_loss": 0.1532, + "loss": 0.1364, + "short_answer_loss": NaN, + "step": 1255, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1598, + "grad_norm": 1.484375, + "learning_rate": 1.3913288618049975e-05, + "long_answer_loss": 0.1598, + "loss": 0.1348, + "short_answer_loss": NaN, + "step": 1256, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1317, + "grad_norm": 1.390625, + "learning_rate": 1.3897907915979984e-05, + "long_answer_loss": 0.1317, + "loss": 0.1392, + "short_answer_loss": NaN, + "step": 1257, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1521, + "grad_norm": 1.4609375, + "learning_rate": 1.3882525070341725e-05, + "long_answer_loss": 0.1521, + "loss": 0.1376, + "short_answer_loss": NaN, + "step": 1258, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1554, + "grad_norm": 1.4296875, + "learning_rate": 1.3867140104723433e-05, + "long_answer_loss": 0.1554, + "loss": 0.1407, + "short_answer_loss": NaN, + "step": 1259, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1281, + "grad_norm": 1.421875, + "learning_rate": 1.385175304271659e-05, + "long_answer_loss": 0.1281, + "loss": 0.1437, + "short_answer_loss": NaN, + "step": 1260, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1242, + "grad_norm": 1.421875, + "learning_rate": 1.3836363907915894e-05, + "long_answer_loss": 0.1242, + "loss": 0.1281, + "short_answer_loss": NaN, + "step": 1261, + "template_loss": 0.0 + }, + { + "epoch": 0.96, + "full_loss": 0.1099, + "grad_norm": 1.390625, + "learning_rate": 1.3820972723919231e-05, + "long_answer_loss": 0.1099, + "loss": 0.1326, + "short_answer_loss": NaN, + "step": 1262, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1166, + "grad_norm": 1.3515625, + "learning_rate": 1.3805579514327616e-05, + "long_answer_loss": 0.1166, + "loss": 0.1307, + "short_answer_loss": NaN, + "step": 1263, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1346, + "grad_norm": 1.4921875, + "learning_rate": 1.379018430274518e-05, + "long_answer_loss": 0.1346, + "loss": 0.1366, + "short_answer_loss": NaN, + "step": 1264, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1454, + "grad_norm": 1.375, + "learning_rate": 1.3774787112779117e-05, + "long_answer_loss": 0.1454, + "loss": 0.1294, + "short_answer_loss": NaN, + "step": 1265, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1384, + "grad_norm": 1.4921875, + "learning_rate": 1.3759387968039658e-05, + "long_answer_loss": 0.1384, + "loss": 0.1391, + "short_answer_loss": NaN, + "step": 1266, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.134, + "grad_norm": 1.40625, + "learning_rate": 1.374398689214003e-05, + "long_answer_loss": 0.134, + "loss": 0.1397, + "short_answer_loss": NaN, + "step": 1267, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1302, + "grad_norm": 1.5234375, + "learning_rate": 1.3728583908696418e-05, + "long_answer_loss": 0.1302, + "loss": 0.142, + "short_answer_loss": NaN, + "step": 1268, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1454, + "grad_norm": 1.453125, + "learning_rate": 1.3713179041327946e-05, + "long_answer_loss": 0.1454, + "loss": 0.1338, + "short_answer_loss": NaN, + "step": 1269, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1291, + "grad_norm": 1.484375, + "learning_rate": 1.3697772313656607e-05, + "long_answer_loss": 0.1291, + "loss": 0.1334, + "short_answer_loss": NaN, + "step": 1270, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1658, + "grad_norm": 1.4375, + "learning_rate": 1.3682363749307261e-05, + "long_answer_loss": 0.1658, + "loss": 0.1341, + "short_answer_loss": NaN, + "step": 1271, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1131, + "grad_norm": 1.34375, + "learning_rate": 1.3666953371907584e-05, + "long_answer_loss": 0.1131, + "loss": 0.1291, + "short_answer_loss": NaN, + "step": 1272, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1271, + "grad_norm": 1.3671875, + "learning_rate": 1.3651541205088022e-05, + "long_answer_loss": 0.1271, + "loss": 0.131, + "short_answer_loss": NaN, + "step": 1273, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1428, + "grad_norm": 1.453125, + "learning_rate": 1.3636127272481772e-05, + "long_answer_loss": 0.1428, + "loss": 0.1287, + "short_answer_loss": NaN, + "step": 1274, + "template_loss": 0.0 + }, + { + "epoch": 0.97, + "full_loss": 0.1256, + "grad_norm": 1.375, + "learning_rate": 1.3620711597724739e-05, + "long_answer_loss": 0.1256, + "loss": 0.1322, + "short_answer_loss": NaN, + "step": 1275, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.14, + "grad_norm": 1.4453125, + "learning_rate": 1.3605294204455502e-05, + "long_answer_loss": 0.14, + "loss": 0.14, + "short_answer_loss": NaN, + "step": 1276, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1428, + "grad_norm": 1.3359375, + "learning_rate": 1.3589875116315259e-05, + "long_answer_loss": 0.1428, + "loss": 0.1293, + "short_answer_loss": NaN, + "step": 1277, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1218, + "grad_norm": 1.4765625, + "learning_rate": 1.3574454356947833e-05, + "long_answer_loss": 0.1218, + "loss": 0.1305, + "short_answer_loss": NaN, + "step": 1278, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1426, + "grad_norm": 1.328125, + "learning_rate": 1.3559031949999587e-05, + "long_answer_loss": 0.1426, + "loss": 0.129, + "short_answer_loss": NaN, + "step": 1279, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1252, + "grad_norm": 1.46875, + "learning_rate": 1.3543607919119425e-05, + "long_answer_loss": 0.1252, + "loss": 0.1349, + "short_answer_loss": NaN, + "step": 1280, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.142, + "grad_norm": 1.4453125, + "learning_rate": 1.3528182287958733e-05, + "long_answer_loss": 0.142, + "loss": 0.1457, + "short_answer_loss": NaN, + "step": 1281, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1327, + "grad_norm": 1.5, + "learning_rate": 1.3512755080171349e-05, + "long_answer_loss": 0.1327, + "loss": 0.1335, + "short_answer_loss": NaN, + "step": 1282, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1111, + "grad_norm": 1.46875, + "learning_rate": 1.3497326319413539e-05, + "long_answer_loss": 0.1111, + "loss": 0.1291, + "short_answer_loss": NaN, + "step": 1283, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1178, + "grad_norm": 1.3515625, + "learning_rate": 1.3481896029343943e-05, + "long_answer_loss": 0.1178, + "loss": 0.1198, + "short_answer_loss": NaN, + "step": 1284, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1229, + "grad_norm": 1.46875, + "learning_rate": 1.3466464233623546e-05, + "long_answer_loss": 0.1229, + "loss": 0.1341, + "short_answer_loss": NaN, + "step": 1285, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.124, + "grad_norm": 1.453125, + "learning_rate": 1.345103095591565e-05, + "long_answer_loss": 0.124, + "loss": 0.1314, + "short_answer_loss": NaN, + "step": 1286, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1297, + "grad_norm": 1.390625, + "learning_rate": 1.343559621988581e-05, + "long_answer_loss": 0.1297, + "loss": 0.1293, + "short_answer_loss": NaN, + "step": 1287, + "template_loss": 0.0 + }, + { + "epoch": 0.98, + "full_loss": 0.1322, + "grad_norm": 1.5, + "learning_rate": 1.3420160049201841e-05, + "long_answer_loss": 0.1322, + "loss": 0.1362, + "short_answer_loss": NaN, + "step": 1288, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1261, + "grad_norm": 1.4609375, + "learning_rate": 1.340472246753374e-05, + "long_answer_loss": 0.1261, + "loss": 0.1346, + "short_answer_loss": NaN, + "step": 1289, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1178, + "grad_norm": 1.4453125, + "learning_rate": 1.3389283498553678e-05, + "long_answer_loss": 0.1178, + "loss": 0.1348, + "short_answer_loss": NaN, + "step": 1290, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1238, + "grad_norm": 1.5, + "learning_rate": 1.3373843165935945e-05, + "long_answer_loss": 0.1238, + "loss": 0.1397, + "short_answer_loss": NaN, + "step": 1291, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1396, + "grad_norm": 1.4375, + "learning_rate": 1.3358401493356934e-05, + "long_answer_loss": 0.1396, + "loss": 0.133, + "short_answer_loss": NaN, + "step": 1292, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1232, + "grad_norm": 1.4296875, + "learning_rate": 1.3342958504495083e-05, + "long_answer_loss": 0.1232, + "loss": 0.1307, + "short_answer_loss": NaN, + "step": 1293, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1483, + "grad_norm": 1.4296875, + "learning_rate": 1.3327514223030845e-05, + "long_answer_loss": 0.1483, + "loss": 0.1354, + "short_answer_loss": NaN, + "step": 1294, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1183, + "grad_norm": 1.453125, + "learning_rate": 1.3312068672646671e-05, + "long_answer_loss": 0.1183, + "loss": 0.1431, + "short_answer_loss": NaN, + "step": 1295, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1229, + "grad_norm": 1.359375, + "learning_rate": 1.3296621877026938e-05, + "long_answer_loss": 0.1229, + "loss": 0.1384, + "short_answer_loss": NaN, + "step": 1296, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1218, + "grad_norm": 1.4296875, + "learning_rate": 1.3281173859857951e-05, + "long_answer_loss": 0.1218, + "loss": 0.1323, + "short_answer_loss": NaN, + "step": 1297, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1291, + "grad_norm": 1.421875, + "learning_rate": 1.3265724644827873e-05, + "long_answer_loss": 0.1291, + "loss": 0.1328, + "short_answer_loss": NaN, + "step": 1298, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1104, + "grad_norm": 1.5078125, + "learning_rate": 1.325027425562671e-05, + "long_answer_loss": 0.1104, + "loss": 0.1295, + "short_answer_loss": NaN, + "step": 1299, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.1227, + "grad_norm": 1.34375, + "learning_rate": 1.3234822715946272e-05, + "long_answer_loss": 0.1227, + "loss": 0.1301, + "short_answer_loss": NaN, + "step": 1300, + "template_loss": 0.0 + }, + { + "epoch": 0.99, + "full_loss": 0.131, + "grad_norm": 1.421875, + "learning_rate": 1.3219370049480128e-05, + "long_answer_loss": 0.131, + "loss": 0.1362, + "short_answer_loss": NaN, + "step": 1301, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1161, + "grad_norm": 1.4140625, + "learning_rate": 1.3203916279923579e-05, + "long_answer_loss": 0.1161, + "loss": 0.1242, + "short_answer_loss": NaN, + "step": 1302, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1381, + "grad_norm": 1.4296875, + "learning_rate": 1.3188461430973612e-05, + "long_answer_loss": 0.1381, + "loss": 0.1407, + "short_answer_loss": NaN, + "step": 1303, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.146, + "grad_norm": 1.4921875, + "learning_rate": 1.3173005526328875e-05, + "long_answer_loss": 0.146, + "loss": 0.1309, + "short_answer_loss": NaN, + "step": 1304, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1337, + "grad_norm": 1.4453125, + "learning_rate": 1.3157548589689625e-05, + "long_answer_loss": 0.1337, + "loss": 0.1295, + "short_answer_loss": NaN, + "step": 1305, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1376, + "grad_norm": 1.3515625, + "learning_rate": 1.3142090644757719e-05, + "long_answer_loss": 0.1376, + "loss": 0.125, + "short_answer_loss": NaN, + "step": 1306, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1283, + "grad_norm": 1.3671875, + "learning_rate": 1.3126631715236546e-05, + "long_answer_loss": 0.1283, + "loss": 0.1245, + "short_answer_loss": NaN, + "step": 1307, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.1443, + "grad_norm": 1.359375, + "learning_rate": 1.3111171824831004e-05, + "long_answer_loss": 0.1443, + "loss": 0.1283, + "short_answer_loss": NaN, + "step": 1308, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.0887, + "grad_norm": 1.28125, + "learning_rate": 1.3095710997247474e-05, + "long_answer_loss": 0.0887, + "loss": 0.0991, + "short_answer_loss": NaN, + "step": 1309, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.0883, + "grad_norm": 1.2265625, + "learning_rate": 1.3080249256193766e-05, + "long_answer_loss": 0.0883, + "loss": 0.0919, + "short_answer_loss": NaN, + "step": 1310, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.089, + "grad_norm": 1.1640625, + "learning_rate": 1.3064786625379096e-05, + "long_answer_loss": 0.089, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1311, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.0936, + "grad_norm": 1.2265625, + "learning_rate": 1.3049323128514041e-05, + "long_answer_loss": 0.0936, + "loss": 0.0863, + "short_answer_loss": NaN, + "step": 1312, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.0967, + "grad_norm": 1.3125, + "learning_rate": 1.3033858789310504e-05, + "long_answer_loss": 0.0967, + "loss": 0.093, + "short_answer_loss": NaN, + "step": 1313, + "template_loss": 0.0 + }, + { + "epoch": 1.0, + "full_loss": 0.0807, + "grad_norm": 1.3046875, + "learning_rate": 1.3018393631481686e-05, + "long_answer_loss": 0.0807, + "loss": 0.0824, + "short_answer_loss": NaN, + "step": 1314, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0766, + "grad_norm": 1.3984375, + "learning_rate": 1.3002927678742044e-05, + "long_answer_loss": 0.0766, + "loss": 0.0875, + "short_answer_loss": NaN, + "step": 1315, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.086, + "grad_norm": 1.3984375, + "learning_rate": 1.298746095480724e-05, + "long_answer_loss": 0.086, + "loss": 0.0831, + "short_answer_loss": NaN, + "step": 1316, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.1022, + "grad_norm": 1.5546875, + "learning_rate": 1.297199348339414e-05, + "long_answer_loss": 0.1022, + "loss": 0.087, + "short_answer_loss": NaN, + "step": 1317, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0888, + "grad_norm": 1.5234375, + "learning_rate": 1.2956525288220738e-05, + "long_answer_loss": 0.0888, + "loss": 0.0874, + "short_answer_loss": NaN, + "step": 1318, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.071, + "grad_norm": 1.578125, + "learning_rate": 1.2941056393006144e-05, + "long_answer_loss": 0.071, + "loss": 0.0926, + "short_answer_loss": NaN, + "step": 1319, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0929, + "grad_norm": 1.609375, + "learning_rate": 1.2925586821470542e-05, + "long_answer_loss": 0.0929, + "loss": 0.0822, + "short_answer_loss": NaN, + "step": 1320, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.07, + "grad_norm": 1.46875, + "learning_rate": 1.2910116597335157e-05, + "long_answer_loss": 0.07, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1321, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0737, + "grad_norm": 1.5703125, + "learning_rate": 1.2894645744322203e-05, + "long_answer_loss": 0.0737, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1322, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0703, + "grad_norm": 1.4375, + "learning_rate": 1.2879174286154874e-05, + "long_answer_loss": 0.0703, + "loss": 0.0835, + "short_answer_loss": NaN, + "step": 1323, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.1101, + "grad_norm": 1.578125, + "learning_rate": 1.2863702246557283e-05, + "long_answer_loss": 0.1101, + "loss": 0.0935, + "short_answer_loss": NaN, + "step": 1324, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0851, + "grad_norm": 1.4453125, + "learning_rate": 1.2848229649254435e-05, + "long_answer_loss": 0.0851, + "loss": 0.0848, + "short_answer_loss": NaN, + "step": 1325, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0851, + "grad_norm": 1.46875, + "learning_rate": 1.2832756517972185e-05, + "long_answer_loss": 0.0851, + "loss": 0.0859, + "short_answer_loss": NaN, + "step": 1326, + "template_loss": 0.0 + }, + { + "epoch": 1.01, + "full_loss": 0.0851, + "grad_norm": 1.3671875, + "learning_rate": 1.2817282876437223e-05, + "long_answer_loss": 0.0851, + "loss": 0.0809, + "short_answer_loss": NaN, + "step": 1327, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0889, + "grad_norm": 1.390625, + "learning_rate": 1.2801808748377e-05, + "long_answer_loss": 0.0889, + "loss": 0.0892, + "short_answer_loss": NaN, + "step": 1328, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.1083, + "grad_norm": 1.3984375, + "learning_rate": 1.2786334157519733e-05, + "long_answer_loss": 0.1083, + "loss": 0.0855, + "short_answer_loss": NaN, + "step": 1329, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.086, + "grad_norm": 1.359375, + "learning_rate": 1.2770859127594334e-05, + "long_answer_loss": 0.086, + "loss": 0.0908, + "short_answer_loss": NaN, + "step": 1330, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0867, + "grad_norm": 1.28125, + "learning_rate": 1.2755383682330394e-05, + "long_answer_loss": 0.0867, + "loss": 0.0827, + "short_answer_loss": NaN, + "step": 1331, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.1012, + "grad_norm": 1.53125, + "learning_rate": 1.2739907845458146e-05, + "long_answer_loss": 0.1012, + "loss": 0.0903, + "short_answer_loss": NaN, + "step": 1332, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0957, + "grad_norm": 1.34375, + "learning_rate": 1.2724431640708418e-05, + "long_answer_loss": 0.0957, + "loss": 0.0819, + "short_answer_loss": NaN, + "step": 1333, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0999, + "grad_norm": 1.4140625, + "learning_rate": 1.2708955091812593e-05, + "long_answer_loss": 0.0999, + "loss": 0.0926, + "short_answer_loss": NaN, + "step": 1334, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0748, + "grad_norm": 1.3828125, + "learning_rate": 1.2693478222502604e-05, + "long_answer_loss": 0.0748, + "loss": 0.083, + "short_answer_loss": NaN, + "step": 1335, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0813, + "grad_norm": 1.3203125, + "learning_rate": 1.2678001056510854e-05, + "long_answer_loss": 0.0813, + "loss": 0.0821, + "short_answer_loss": NaN, + "step": 1336, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.0686, + "grad_norm": 1.515625, + "learning_rate": 1.2662523617570213e-05, + "long_answer_loss": 0.0686, + "loss": 0.0884, + "short_answer_loss": NaN, + "step": 1337, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.1061, + "grad_norm": 1.390625, + "learning_rate": 1.2647045929413966e-05, + "long_answer_loss": 0.1061, + "loss": 0.0875, + "short_answer_loss": NaN, + "step": 1338, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.106, + "grad_norm": 1.3984375, + "learning_rate": 1.2631568015775777e-05, + "long_answer_loss": 0.106, + "loss": 0.0869, + "short_answer_loss": NaN, + "step": 1339, + "template_loss": 0.0 + }, + { + "epoch": 1.02, + "full_loss": 0.1007, + "grad_norm": 1.4609375, + "learning_rate": 1.2616089900389663e-05, + "long_answer_loss": 0.1007, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1340, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0857, + "grad_norm": 1.4453125, + "learning_rate": 1.2600611606989945e-05, + "long_answer_loss": 0.0857, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1341, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0922, + "grad_norm": 1.375, + "learning_rate": 1.2585133159311217e-05, + "long_answer_loss": 0.0922, + "loss": 0.0852, + "short_answer_loss": NaN, + "step": 1342, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0946, + "grad_norm": 1.3984375, + "learning_rate": 1.256965458108831e-05, + "long_answer_loss": 0.0946, + "loss": 0.0846, + "short_answer_loss": NaN, + "step": 1343, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.092, + "grad_norm": 1.3984375, + "learning_rate": 1.2554175896056259e-05, + "long_answer_loss": 0.092, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1344, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0707, + "grad_norm": 1.2578125, + "learning_rate": 1.2538697127950258e-05, + "long_answer_loss": 0.0707, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 1345, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0801, + "grad_norm": 1.3515625, + "learning_rate": 1.252321830050563e-05, + "long_answer_loss": 0.0801, + "loss": 0.0839, + "short_answer_loss": NaN, + "step": 1346, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0828, + "grad_norm": 1.390625, + "learning_rate": 1.2507739437457795e-05, + "long_answer_loss": 0.0828, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1347, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.1096, + "grad_norm": 1.4921875, + "learning_rate": 1.249226056254221e-05, + "long_answer_loss": 0.1096, + "loss": 0.0884, + "short_answer_loss": NaN, + "step": 1348, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0804, + "grad_norm": 1.3984375, + "learning_rate": 1.2476781699494372e-05, + "long_answer_loss": 0.0804, + "loss": 0.0852, + "short_answer_loss": NaN, + "step": 1349, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.083, + "grad_norm": 1.484375, + "learning_rate": 1.2461302872049741e-05, + "long_answer_loss": 0.083, + "loss": 0.0818, + "short_answer_loss": NaN, + "step": 1350, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0825, + "grad_norm": 1.328125, + "learning_rate": 1.2445824103943744e-05, + "long_answer_loss": 0.0825, + "loss": 0.0838, + "short_answer_loss": NaN, + "step": 1351, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.083, + "grad_norm": 1.390625, + "learning_rate": 1.243034541891169e-05, + "long_answer_loss": 0.083, + "loss": 0.0853, + "short_answer_loss": NaN, + "step": 1352, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0684, + "grad_norm": 1.359375, + "learning_rate": 1.2414866840688786e-05, + "long_answer_loss": 0.0684, + "loss": 0.0807, + "short_answer_loss": NaN, + "step": 1353, + "template_loss": 0.0 + }, + { + "epoch": 1.03, + "full_loss": 0.0827, + "grad_norm": 1.4453125, + "learning_rate": 1.239938839301006e-05, + "long_answer_loss": 0.0827, + "loss": 0.0833, + "short_answer_loss": NaN, + "step": 1354, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0866, + "grad_norm": 1.4375, + "learning_rate": 1.238391009961034e-05, + "long_answer_loss": 0.0866, + "loss": 0.0867, + "short_answer_loss": NaN, + "step": 1355, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0857, + "grad_norm": 1.3046875, + "learning_rate": 1.2368431984224226e-05, + "long_answer_loss": 0.0857, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1356, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0837, + "grad_norm": 1.4765625, + "learning_rate": 1.2352954070586036e-05, + "long_answer_loss": 0.0837, + "loss": 0.084, + "short_answer_loss": NaN, + "step": 1357, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0797, + "grad_norm": 1.34375, + "learning_rate": 1.2337476382429791e-05, + "long_answer_loss": 0.0797, + "loss": 0.0845, + "short_answer_loss": NaN, + "step": 1358, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0851, + "grad_norm": 1.390625, + "learning_rate": 1.2321998943489147e-05, + "long_answer_loss": 0.0851, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1359, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0974, + "grad_norm": 1.4140625, + "learning_rate": 1.23065217774974e-05, + "long_answer_loss": 0.0974, + "loss": 0.0881, + "short_answer_loss": NaN, + "step": 1360, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0969, + "grad_norm": 1.2890625, + "learning_rate": 1.2291044908187405e-05, + "long_answer_loss": 0.0969, + "loss": 0.0832, + "short_answer_loss": NaN, + "step": 1361, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0731, + "grad_norm": 1.421875, + "learning_rate": 1.2275568359291587e-05, + "long_answer_loss": 0.0731, + "loss": 0.0834, + "short_answer_loss": NaN, + "step": 1362, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.1057, + "grad_norm": 1.5078125, + "learning_rate": 1.2260092154541857e-05, + "long_answer_loss": 0.1057, + "loss": 0.0897, + "short_answer_loss": NaN, + "step": 1363, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0856, + "grad_norm": 1.484375, + "learning_rate": 1.2244616317669607e-05, + "long_answer_loss": 0.0856, + "loss": 0.0881, + "short_answer_loss": NaN, + "step": 1364, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0663, + "grad_norm": 1.359375, + "learning_rate": 1.2229140872405672e-05, + "long_answer_loss": 0.0663, + "loss": 0.0864, + "short_answer_loss": NaN, + "step": 1365, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0932, + "grad_norm": 1.359375, + "learning_rate": 1.2213665842480271e-05, + "long_answer_loss": 0.0932, + "loss": 0.0829, + "short_answer_loss": NaN, + "step": 1366, + "template_loss": 0.0 + }, + { + "epoch": 1.04, + "full_loss": 0.0804, + "grad_norm": 1.296875, + "learning_rate": 1.2198191251623006e-05, + "long_answer_loss": 0.0804, + "loss": 0.0845, + "short_answer_loss": NaN, + "step": 1367, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0857, + "grad_norm": 1.328125, + "learning_rate": 1.218271712356278e-05, + "long_answer_loss": 0.0857, + "loss": 0.0814, + "short_answer_loss": NaN, + "step": 1368, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0953, + "grad_norm": 1.4296875, + "learning_rate": 1.2167243482027816e-05, + "long_answer_loss": 0.0953, + "loss": 0.083, + "short_answer_loss": NaN, + "step": 1369, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0867, + "grad_norm": 1.609375, + "learning_rate": 1.2151770350745568e-05, + "long_answer_loss": 0.0867, + "loss": 0.0822, + "short_answer_loss": NaN, + "step": 1370, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0701, + "grad_norm": 1.3359375, + "learning_rate": 1.2136297753442721e-05, + "long_answer_loss": 0.0701, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1371, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.081, + "grad_norm": 1.3984375, + "learning_rate": 1.2120825713845125e-05, + "long_answer_loss": 0.081, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1372, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0932, + "grad_norm": 1.453125, + "learning_rate": 1.2105354255677798e-05, + "long_answer_loss": 0.0932, + "loss": 0.0843, + "short_answer_loss": NaN, + "step": 1373, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0708, + "grad_norm": 1.4375, + "learning_rate": 1.2089883402664851e-05, + "long_answer_loss": 0.0708, + "loss": 0.0843, + "short_answer_loss": NaN, + "step": 1374, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.08, + "grad_norm": 1.34375, + "learning_rate": 1.2074413178529461e-05, + "long_answer_loss": 0.08, + "loss": 0.0828, + "short_answer_loss": NaN, + "step": 1375, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0834, + "grad_norm": 1.375, + "learning_rate": 1.2058943606993861e-05, + "long_answer_loss": 0.0834, + "loss": 0.0835, + "short_answer_loss": NaN, + "step": 1376, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.081, + "grad_norm": 1.4453125, + "learning_rate": 1.2043474711779263e-05, + "long_answer_loss": 0.081, + "loss": 0.0861, + "short_answer_loss": NaN, + "step": 1377, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0948, + "grad_norm": 1.4609375, + "learning_rate": 1.2028006516605863e-05, + "long_answer_loss": 0.0948, + "loss": 0.0867, + "short_answer_loss": NaN, + "step": 1378, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.1134, + "grad_norm": 1.4140625, + "learning_rate": 1.2012539045192759e-05, + "long_answer_loss": 0.1134, + "loss": 0.079, + "short_answer_loss": NaN, + "step": 1379, + "template_loss": 0.0 + }, + { + "epoch": 1.05, + "full_loss": 0.0703, + "grad_norm": 1.46875, + "learning_rate": 1.199707232125796e-05, + "long_answer_loss": 0.0703, + "loss": 0.083, + "short_answer_loss": NaN, + "step": 1380, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.1042, + "grad_norm": 1.4375, + "learning_rate": 1.1981606368518313e-05, + "long_answer_loss": 0.1042, + "loss": 0.0879, + "short_answer_loss": NaN, + "step": 1381, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0727, + "grad_norm": 1.421875, + "learning_rate": 1.1966141210689497e-05, + "long_answer_loss": 0.0727, + "loss": 0.0842, + "short_answer_loss": NaN, + "step": 1382, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0665, + "grad_norm": 1.4375, + "learning_rate": 1.195067687148596e-05, + "long_answer_loss": 0.0665, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 1383, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0811, + "grad_norm": 1.4296875, + "learning_rate": 1.1935213374620907e-05, + "long_answer_loss": 0.0811, + "loss": 0.0875, + "short_answer_loss": NaN, + "step": 1384, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0858, + "grad_norm": 1.453125, + "learning_rate": 1.1919750743806239e-05, + "long_answer_loss": 0.0858, + "loss": 0.0847, + "short_answer_loss": NaN, + "step": 1385, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0793, + "grad_norm": 1.4296875, + "learning_rate": 1.1904289002752529e-05, + "long_answer_loss": 0.0793, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1386, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0769, + "grad_norm": 1.375, + "learning_rate": 1.1888828175169e-05, + "long_answer_loss": 0.0769, + "loss": 0.0839, + "short_answer_loss": NaN, + "step": 1387, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0846, + "grad_norm": 1.296875, + "learning_rate": 1.1873368284763457e-05, + "long_answer_loss": 0.0846, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1388, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0824, + "grad_norm": 1.4765625, + "learning_rate": 1.1857909355242283e-05, + "long_answer_loss": 0.0824, + "loss": 0.0823, + "short_answer_loss": NaN, + "step": 1389, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0772, + "grad_norm": 1.3984375, + "learning_rate": 1.1842451410310373e-05, + "long_answer_loss": 0.0772, + "loss": 0.0846, + "short_answer_loss": NaN, + "step": 1390, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0789, + "grad_norm": 1.40625, + "learning_rate": 1.182699447367113e-05, + "long_answer_loss": 0.0789, + "loss": 0.082, + "short_answer_loss": NaN, + "step": 1391, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0912, + "grad_norm": 1.40625, + "learning_rate": 1.1811538569026391e-05, + "long_answer_loss": 0.0912, + "loss": 0.0846, + "short_answer_loss": NaN, + "step": 1392, + "template_loss": 0.0 + }, + { + "epoch": 1.06, + "full_loss": 0.0715, + "grad_norm": 1.328125, + "learning_rate": 1.1796083720076426e-05, + "long_answer_loss": 0.0715, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1393, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0857, + "grad_norm": 1.375, + "learning_rate": 1.1780629950519875e-05, + "long_answer_loss": 0.0857, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 1394, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.098, + "grad_norm": 1.3671875, + "learning_rate": 1.1765177284053731e-05, + "long_answer_loss": 0.098, + "loss": 0.0819, + "short_answer_loss": NaN, + "step": 1395, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0813, + "grad_norm": 1.4453125, + "learning_rate": 1.1749725744373295e-05, + "long_answer_loss": 0.0813, + "loss": 0.0824, + "short_answer_loss": NaN, + "step": 1396, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0734, + "grad_norm": 1.328125, + "learning_rate": 1.173427535517213e-05, + "long_answer_loss": 0.0734, + "loss": 0.0822, + "short_answer_loss": NaN, + "step": 1397, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0958, + "grad_norm": 1.3359375, + "learning_rate": 1.1718826140142055e-05, + "long_answer_loss": 0.0958, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1398, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0984, + "grad_norm": 1.5625, + "learning_rate": 1.170337812297306e-05, + "long_answer_loss": 0.0984, + "loss": 0.083, + "short_answer_loss": NaN, + "step": 1399, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0702, + "grad_norm": 1.4296875, + "learning_rate": 1.1687931327353333e-05, + "long_answer_loss": 0.0702, + "loss": 0.0832, + "short_answer_loss": NaN, + "step": 1400, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0731, + "grad_norm": 1.421875, + "learning_rate": 1.1672485776969156e-05, + "long_answer_loss": 0.0731, + "loss": 0.0833, + "short_answer_loss": NaN, + "step": 1401, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0774, + "grad_norm": 1.453125, + "learning_rate": 1.1657041495504922e-05, + "long_answer_loss": 0.0774, + "loss": 0.0851, + "short_answer_loss": NaN, + "step": 1402, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0871, + "grad_norm": 1.375, + "learning_rate": 1.1641598506643066e-05, + "long_answer_loss": 0.0871, + "loss": 0.0804, + "short_answer_loss": NaN, + "step": 1403, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0885, + "grad_norm": 1.390625, + "learning_rate": 1.1626156834064057e-05, + "long_answer_loss": 0.0885, + "loss": 0.0873, + "short_answer_loss": NaN, + "step": 1404, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0761, + "grad_norm": 1.46875, + "learning_rate": 1.1610716501446328e-05, + "long_answer_loss": 0.0761, + "loss": 0.0845, + "short_answer_loss": NaN, + "step": 1405, + "template_loss": 0.0 + }, + { + "epoch": 1.07, + "full_loss": 0.0856, + "grad_norm": 1.421875, + "learning_rate": 1.1595277532466262e-05, + "long_answer_loss": 0.0856, + "loss": 0.0853, + "short_answer_loss": NaN, + "step": 1406, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0693, + "grad_norm": 1.328125, + "learning_rate": 1.1579839950798165e-05, + "long_answer_loss": 0.0693, + "loss": 0.079, + "short_answer_loss": NaN, + "step": 1407, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0878, + "grad_norm": 1.4453125, + "learning_rate": 1.1564403780114192e-05, + "long_answer_loss": 0.0878, + "loss": 0.0861, + "short_answer_loss": NaN, + "step": 1408, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0636, + "grad_norm": 1.3828125, + "learning_rate": 1.1548969044084358e-05, + "long_answer_loss": 0.0636, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1409, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0819, + "grad_norm": 1.46875, + "learning_rate": 1.1533535766376454e-05, + "long_answer_loss": 0.0819, + "loss": 0.0812, + "short_answer_loss": NaN, + "step": 1410, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0836, + "grad_norm": 1.4609375, + "learning_rate": 1.151810397065606e-05, + "long_answer_loss": 0.0836, + "loss": 0.0805, + "short_answer_loss": NaN, + "step": 1411, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0822, + "grad_norm": 1.4453125, + "learning_rate": 1.150267368058646e-05, + "long_answer_loss": 0.0822, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1412, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0863, + "grad_norm": 1.4765625, + "learning_rate": 1.1487244919828654e-05, + "long_answer_loss": 0.0863, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 1413, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0829, + "grad_norm": 1.53125, + "learning_rate": 1.1471817712041272e-05, + "long_answer_loss": 0.0829, + "loss": 0.0805, + "short_answer_loss": NaN, + "step": 1414, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0806, + "grad_norm": 1.5703125, + "learning_rate": 1.1456392080880578e-05, + "long_answer_loss": 0.0806, + "loss": 0.0871, + "short_answer_loss": NaN, + "step": 1415, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0796, + "grad_norm": 1.546875, + "learning_rate": 1.1440968050000416e-05, + "long_answer_loss": 0.0796, + "loss": 0.0835, + "short_answer_loss": NaN, + "step": 1416, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0786, + "grad_norm": 1.53125, + "learning_rate": 1.1425545643052171e-05, + "long_answer_loss": 0.0786, + "loss": 0.0887, + "short_answer_loss": NaN, + "step": 1417, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0843, + "grad_norm": 1.4375, + "learning_rate": 1.1410124883684744e-05, + "long_answer_loss": 0.0843, + "loss": 0.0801, + "short_answer_loss": NaN, + "step": 1418, + "template_loss": 0.0 + }, + { + "epoch": 1.08, + "full_loss": 0.0787, + "grad_norm": 1.390625, + "learning_rate": 1.1394705795544503e-05, + "long_answer_loss": 0.0787, + "loss": 0.0826, + "short_answer_loss": NaN, + "step": 1419, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.079, + "grad_norm": 1.390625, + "learning_rate": 1.1379288402275264e-05, + "long_answer_loss": 0.079, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1420, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.076, + "grad_norm": 1.5546875, + "learning_rate": 1.1363872727518226e-05, + "long_answer_loss": 0.076, + "loss": 0.0867, + "short_answer_loss": NaN, + "step": 1421, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0726, + "grad_norm": 1.5859375, + "learning_rate": 1.134845879491198e-05, + "long_answer_loss": 0.0726, + "loss": 0.0856, + "short_answer_loss": NaN, + "step": 1422, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0805, + "grad_norm": 1.53125, + "learning_rate": 1.1333046628092417e-05, + "long_answer_loss": 0.0805, + "loss": 0.0829, + "short_answer_loss": NaN, + "step": 1423, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0978, + "grad_norm": 1.375, + "learning_rate": 1.131763625069274e-05, + "long_answer_loss": 0.0978, + "loss": 0.081, + "short_answer_loss": NaN, + "step": 1424, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0826, + "grad_norm": 1.4140625, + "learning_rate": 1.1302227686343398e-05, + "long_answer_loss": 0.0826, + "loss": 0.0831, + "short_answer_loss": NaN, + "step": 1425, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.075, + "grad_norm": 1.4375, + "learning_rate": 1.1286820958672057e-05, + "long_answer_loss": 0.075, + "loss": 0.0831, + "short_answer_loss": NaN, + "step": 1426, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0636, + "grad_norm": 1.4609375, + "learning_rate": 1.1271416091303586e-05, + "long_answer_loss": 0.0636, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 1427, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.092, + "grad_norm": 1.40625, + "learning_rate": 1.1256013107859974e-05, + "long_answer_loss": 0.092, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1428, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0747, + "grad_norm": 1.421875, + "learning_rate": 1.1240612031960347e-05, + "long_answer_loss": 0.0747, + "loss": 0.0831, + "short_answer_loss": NaN, + "step": 1429, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0879, + "grad_norm": 1.453125, + "learning_rate": 1.1225212887220886e-05, + "long_answer_loss": 0.0879, + "loss": 0.0781, + "short_answer_loss": NaN, + "step": 1430, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0784, + "grad_norm": 1.4921875, + "learning_rate": 1.1209815697254825e-05, + "long_answer_loss": 0.0784, + "loss": 0.088, + "short_answer_loss": NaN, + "step": 1431, + "template_loss": 0.0 + }, + { + "epoch": 1.09, + "full_loss": 0.0798, + "grad_norm": 1.375, + "learning_rate": 1.1194420485672384e-05, + "long_answer_loss": 0.0798, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 1432, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0801, + "grad_norm": 1.46875, + "learning_rate": 1.1179027276080772e-05, + "long_answer_loss": 0.0801, + "loss": 0.0804, + "short_answer_loss": NaN, + "step": 1433, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0786, + "grad_norm": 1.546875, + "learning_rate": 1.1163636092084105e-05, + "long_answer_loss": 0.0786, + "loss": 0.0817, + "short_answer_loss": NaN, + "step": 1434, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0794, + "grad_norm": 1.3671875, + "learning_rate": 1.1148246957283415e-05, + "long_answer_loss": 0.0794, + "loss": 0.085, + "short_answer_loss": NaN, + "step": 1435, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0681, + "grad_norm": 1.4765625, + "learning_rate": 1.1132859895276574e-05, + "long_answer_loss": 0.0681, + "loss": 0.0845, + "short_answer_loss": NaN, + "step": 1436, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0926, + "grad_norm": 1.5703125, + "learning_rate": 1.1117474929658276e-05, + "long_answer_loss": 0.0926, + "loss": 0.0817, + "short_answer_loss": NaN, + "step": 1437, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0912, + "grad_norm": 1.4765625, + "learning_rate": 1.1102092084020018e-05, + "long_answer_loss": 0.0912, + "loss": 0.081, + "short_answer_loss": NaN, + "step": 1438, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0596, + "grad_norm": 1.390625, + "learning_rate": 1.1086711381950026e-05, + "long_answer_loss": 0.0596, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1439, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0739, + "grad_norm": 1.515625, + "learning_rate": 1.1071332847033255e-05, + "long_answer_loss": 0.0739, + "loss": 0.0828, + "short_answer_loss": NaN, + "step": 1440, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0889, + "grad_norm": 1.46875, + "learning_rate": 1.105595650285132e-05, + "long_answer_loss": 0.0889, + "loss": 0.0825, + "short_answer_loss": NaN, + "step": 1441, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0858, + "grad_norm": 1.421875, + "learning_rate": 1.1040582372982494e-05, + "long_answer_loss": 0.0858, + "loss": 0.0803, + "short_answer_loss": NaN, + "step": 1442, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0918, + "grad_norm": 1.453125, + "learning_rate": 1.1025210481001642e-05, + "long_answer_loss": 0.0918, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1443, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0775, + "grad_norm": 1.5078125, + "learning_rate": 1.1009840850480207e-05, + "long_answer_loss": 0.0775, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1444, + "template_loss": 0.0 + }, + { + "epoch": 1.1, + "full_loss": 0.0748, + "grad_norm": 1.4453125, + "learning_rate": 1.0994473504986155e-05, + "long_answer_loss": 0.0748, + "loss": 0.0802, + "short_answer_loss": NaN, + "step": 1445, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0944, + "grad_norm": 1.515625, + "learning_rate": 1.0979108468083956e-05, + "long_answer_loss": 0.0944, + "loss": 0.0855, + "short_answer_loss": NaN, + "step": 1446, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0746, + "grad_norm": 1.390625, + "learning_rate": 1.0963745763334533e-05, + "long_answer_loss": 0.0746, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 1447, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.073, + "grad_norm": 1.34375, + "learning_rate": 1.0948385414295235e-05, + "long_answer_loss": 0.073, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1448, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0751, + "grad_norm": 1.5, + "learning_rate": 1.0933027444519805e-05, + "long_answer_loss": 0.0751, + "loss": 0.0826, + "short_answer_loss": NaN, + "step": 1449, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0762, + "grad_norm": 1.484375, + "learning_rate": 1.0917671877558327e-05, + "long_answer_loss": 0.0762, + "loss": 0.0865, + "short_answer_loss": NaN, + "step": 1450, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0766, + "grad_norm": 1.4296875, + "learning_rate": 1.0902318736957214e-05, + "long_answer_loss": 0.0766, + "loss": 0.0794, + "short_answer_loss": NaN, + "step": 1451, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0849, + "grad_norm": 1.46875, + "learning_rate": 1.0886968046259141e-05, + "long_answer_loss": 0.0849, + "loss": 0.0826, + "short_answer_loss": NaN, + "step": 1452, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0906, + "grad_norm": 1.3515625, + "learning_rate": 1.0871619829003044e-05, + "long_answer_loss": 0.0906, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1453, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0765, + "grad_norm": 1.375, + "learning_rate": 1.0856274108724052e-05, + "long_answer_loss": 0.0765, + "loss": 0.0821, + "short_answer_loss": NaN, + "step": 1454, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0873, + "grad_norm": 1.328125, + "learning_rate": 1.0840930908953477e-05, + "long_answer_loss": 0.0873, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1455, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0871, + "grad_norm": 1.484375, + "learning_rate": 1.0825590253218758e-05, + "long_answer_loss": 0.0871, + "loss": 0.0815, + "short_answer_loss": NaN, + "step": 1456, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0745, + "grad_norm": 1.421875, + "learning_rate": 1.0810252165043427e-05, + "long_answer_loss": 0.0745, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1457, + "template_loss": 0.0 + }, + { + "epoch": 1.11, + "full_loss": 0.0729, + "grad_norm": 1.4375, + "learning_rate": 1.07949166679471e-05, + "long_answer_loss": 0.0729, + "loss": 0.0842, + "short_answer_loss": NaN, + "step": 1458, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0866, + "grad_norm": 1.3671875, + "learning_rate": 1.0779583785445393e-05, + "long_answer_loss": 0.0866, + "loss": 0.0836, + "short_answer_loss": NaN, + "step": 1459, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0715, + "grad_norm": 1.390625, + "learning_rate": 1.0764253541049941e-05, + "long_answer_loss": 0.0715, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1460, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.1092, + "grad_norm": 1.4453125, + "learning_rate": 1.074892595826831e-05, + "long_answer_loss": 0.1092, + "loss": 0.0886, + "short_answer_loss": NaN, + "step": 1461, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0715, + "grad_norm": 1.5078125, + "learning_rate": 1.0733601060603999e-05, + "long_answer_loss": 0.0715, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1462, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0765, + "grad_norm": 1.4140625, + "learning_rate": 1.0718278871556374e-05, + "long_answer_loss": 0.0765, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1463, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0692, + "grad_norm": 1.3828125, + "learning_rate": 1.0702959414620673e-05, + "long_answer_loss": 0.0692, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 1464, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0881, + "grad_norm": 1.390625, + "learning_rate": 1.0687642713287916e-05, + "long_answer_loss": 0.0881, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1465, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.086, + "grad_norm": 1.5234375, + "learning_rate": 1.0672328791044921e-05, + "long_answer_loss": 0.086, + "loss": 0.0837, + "short_answer_loss": NaN, + "step": 1466, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.065, + "grad_norm": 1.390625, + "learning_rate": 1.0657017671374233e-05, + "long_answer_loss": 0.065, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1467, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0787, + "grad_norm": 1.4140625, + "learning_rate": 1.0641709377754094e-05, + "long_answer_loss": 0.0787, + "loss": 0.0812, + "short_answer_loss": NaN, + "step": 1468, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0766, + "grad_norm": 1.484375, + "learning_rate": 1.0626403933658426e-05, + "long_answer_loss": 0.0766, + "loss": 0.0834, + "short_answer_loss": NaN, + "step": 1469, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0778, + "grad_norm": 1.421875, + "learning_rate": 1.0611101362556773e-05, + "long_answer_loss": 0.0778, + "loss": 0.0834, + "short_answer_loss": NaN, + "step": 1470, + "template_loss": 0.0 + }, + { + "epoch": 1.12, + "full_loss": 0.0884, + "grad_norm": 1.4140625, + "learning_rate": 1.059580168791428e-05, + "long_answer_loss": 0.0884, + "loss": 0.082, + "short_answer_loss": NaN, + "step": 1471, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0676, + "grad_norm": 1.4296875, + "learning_rate": 1.0580504933191635e-05, + "long_answer_loss": 0.0676, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1472, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0868, + "grad_norm": 1.3515625, + "learning_rate": 1.0565211121845075e-05, + "long_answer_loss": 0.0868, + "loss": 0.0803, + "short_answer_loss": NaN, + "step": 1473, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.075, + "grad_norm": 1.4140625, + "learning_rate": 1.0549920277326293e-05, + "long_answer_loss": 0.075, + "loss": 0.079, + "short_answer_loss": NaN, + "step": 1474, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.08, + "grad_norm": 1.4609375, + "learning_rate": 1.0534632423082462e-05, + "long_answer_loss": 0.08, + "loss": 0.0826, + "short_answer_loss": NaN, + "step": 1475, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0609, + "grad_norm": 1.390625, + "learning_rate": 1.051934758255615e-05, + "long_answer_loss": 0.0609, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 1476, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0834, + "grad_norm": 1.4296875, + "learning_rate": 1.0504065779185302e-05, + "long_answer_loss": 0.0834, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1477, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0688, + "grad_norm": 1.3671875, + "learning_rate": 1.0488787036403226e-05, + "long_answer_loss": 0.0688, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1478, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0802, + "grad_norm": 1.40625, + "learning_rate": 1.0473511377638512e-05, + "long_answer_loss": 0.0802, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1479, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0994, + "grad_norm": 1.4765625, + "learning_rate": 1.0458238826315041e-05, + "long_answer_loss": 0.0994, + "loss": 0.0807, + "short_answer_loss": NaN, + "step": 1480, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0845, + "grad_norm": 1.375, + "learning_rate": 1.0442969405851917e-05, + "long_answer_loss": 0.0845, + "loss": 0.0852, + "short_answer_loss": NaN, + "step": 1481, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0796, + "grad_norm": 1.3359375, + "learning_rate": 1.0427703139663453e-05, + "long_answer_loss": 0.0796, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1482, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0865, + "grad_norm": 1.4296875, + "learning_rate": 1.041244005115911e-05, + "long_answer_loss": 0.0865, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 1483, + "template_loss": 0.0 + }, + { + "epoch": 1.13, + "full_loss": 0.0864, + "grad_norm": 1.40625, + "learning_rate": 1.0397180163743494e-05, + "long_answer_loss": 0.0864, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1484, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0764, + "grad_norm": 1.4375, + "learning_rate": 1.0381923500816288e-05, + "long_answer_loss": 0.0764, + "loss": 0.0811, + "short_answer_loss": NaN, + "step": 1485, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.07, + "grad_norm": 1.5078125, + "learning_rate": 1.036667008577224e-05, + "long_answer_loss": 0.07, + "loss": 0.0849, + "short_answer_loss": NaN, + "step": 1486, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0768, + "grad_norm": 1.3984375, + "learning_rate": 1.0351419942001115e-05, + "long_answer_loss": 0.0768, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1487, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0831, + "grad_norm": 1.4453125, + "learning_rate": 1.0336173092887655e-05, + "long_answer_loss": 0.0831, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1488, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0695, + "grad_norm": 1.375, + "learning_rate": 1.0320929561811564e-05, + "long_answer_loss": 0.0695, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1489, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0723, + "grad_norm": 1.4375, + "learning_rate": 1.0305689372147442e-05, + "long_answer_loss": 0.0723, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1490, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0876, + "grad_norm": 1.5234375, + "learning_rate": 1.029045254726478e-05, + "long_answer_loss": 0.0876, + "loss": 0.0837, + "short_answer_loss": NaN, + "step": 1491, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0644, + "grad_norm": 1.5, + "learning_rate": 1.0275219110527898e-05, + "long_answer_loss": 0.0644, + "loss": 0.0803, + "short_answer_loss": NaN, + "step": 1492, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0864, + "grad_norm": 1.546875, + "learning_rate": 1.025998908529593e-05, + "long_answer_loss": 0.0864, + "loss": 0.0799, + "short_answer_loss": NaN, + "step": 1493, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0661, + "grad_norm": 1.390625, + "learning_rate": 1.0244762494922766e-05, + "long_answer_loss": 0.0661, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 1494, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0757, + "grad_norm": 1.3671875, + "learning_rate": 1.0229539362757046e-05, + "long_answer_loss": 0.0757, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 1495, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0947, + "grad_norm": 1.4609375, + "learning_rate": 1.021431971214209e-05, + "long_answer_loss": 0.0947, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1496, + "template_loss": 0.0 + }, + { + "epoch": 1.14, + "full_loss": 0.0871, + "grad_norm": 1.4296875, + "learning_rate": 1.0199103566415896e-05, + "long_answer_loss": 0.0871, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1497, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0774, + "grad_norm": 1.4296875, + "learning_rate": 1.0183890948911074e-05, + "long_answer_loss": 0.0774, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1498, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.073, + "grad_norm": 1.375, + "learning_rate": 1.0168681882954825e-05, + "long_answer_loss": 0.073, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1499, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0767, + "grad_norm": 1.390625, + "learning_rate": 1.0153476391868917e-05, + "long_answer_loss": 0.0767, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 1500, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0713, + "grad_norm": 1.3671875, + "learning_rate": 1.0138274498969614e-05, + "long_answer_loss": 0.0713, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1501, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0754, + "grad_norm": 1.3984375, + "learning_rate": 1.012307622756769e-05, + "long_answer_loss": 0.0754, + "loss": 0.0819, + "short_answer_loss": NaN, + "step": 1502, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0941, + "grad_norm": 1.59375, + "learning_rate": 1.010788160096834e-05, + "long_answer_loss": 0.0941, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1503, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0829, + "grad_norm": 1.4765625, + "learning_rate": 1.009269064247119e-05, + "long_answer_loss": 0.0829, + "loss": 0.085, + "short_answer_loss": NaN, + "step": 1504, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0668, + "grad_norm": 1.3984375, + "learning_rate": 1.0077503375370226e-05, + "long_answer_loss": 0.0668, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1505, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.07, + "grad_norm": 1.421875, + "learning_rate": 1.0062319822953787e-05, + "long_answer_loss": 0.07, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 1506, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0695, + "grad_norm": 1.3671875, + "learning_rate": 1.0047140008504499e-05, + "long_answer_loss": 0.0695, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1507, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0883, + "grad_norm": 1.484375, + "learning_rate": 1.0031963955299272e-05, + "long_answer_loss": 0.0883, + "loss": 0.0858, + "short_answer_loss": NaN, + "step": 1508, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0757, + "grad_norm": 1.4375, + "learning_rate": 1.0016791686609248e-05, + "long_answer_loss": 0.0757, + "loss": 0.0823, + "short_answer_loss": NaN, + "step": 1509, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0724, + "grad_norm": 1.515625, + "learning_rate": 1.0001623225699747e-05, + "long_answer_loss": 0.0724, + "loss": 0.0838, + "short_answer_loss": NaN, + "step": 1510, + "template_loss": 0.0 + }, + { + "epoch": 1.15, + "full_loss": 0.0758, + "grad_norm": 1.3515625, + "learning_rate": 9.986458595830275e-06, + "long_answer_loss": 0.0758, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1511, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0864, + "grad_norm": 1.4765625, + "learning_rate": 9.971297820254447e-06, + "long_answer_loss": 0.0864, + "loss": 0.086, + "short_answer_loss": NaN, + "step": 1512, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0776, + "grad_norm": 1.375, + "learning_rate": 9.956140922219975e-06, + "long_answer_loss": 0.0776, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1513, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0826, + "grad_norm": 1.5390625, + "learning_rate": 9.940987924968623e-06, + "long_answer_loss": 0.0826, + "loss": 0.0811, + "short_answer_loss": NaN, + "step": 1514, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0727, + "grad_norm": 1.4765625, + "learning_rate": 9.925838851736172e-06, + "long_answer_loss": 0.0727, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1515, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0954, + "grad_norm": 1.3828125, + "learning_rate": 9.910693725752384e-06, + "long_answer_loss": 0.0954, + "loss": 0.078, + "short_answer_loss": NaN, + "step": 1516, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.07, + "grad_norm": 1.609375, + "learning_rate": 9.895552570240979e-06, + "long_answer_loss": 0.07, + "loss": 0.0794, + "short_answer_loss": NaN, + "step": 1517, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0809, + "grad_norm": 1.4140625, + "learning_rate": 9.880415408419577e-06, + "long_answer_loss": 0.0809, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1518, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0741, + "grad_norm": 1.453125, + "learning_rate": 9.865282263499672e-06, + "long_answer_loss": 0.0741, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 1519, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0694, + "grad_norm": 1.46875, + "learning_rate": 9.850153158686617e-06, + "long_answer_loss": 0.0694, + "loss": 0.0808, + "short_answer_loss": NaN, + "step": 1520, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0843, + "grad_norm": 1.4453125, + "learning_rate": 9.835028117179549e-06, + "long_answer_loss": 0.0843, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 1521, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0758, + "grad_norm": 1.546875, + "learning_rate": 9.819907162171385e-06, + "long_answer_loss": 0.0758, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 1522, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0814, + "grad_norm": 1.5390625, + "learning_rate": 9.80479031684877e-06, + "long_answer_loss": 0.0814, + "loss": 0.0849, + "short_answer_loss": NaN, + "step": 1523, + "template_loss": 0.0 + }, + { + "epoch": 1.16, + "full_loss": 0.0871, + "grad_norm": 1.546875, + "learning_rate": 9.789677604392058e-06, + "long_answer_loss": 0.0871, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1524, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0672, + "grad_norm": 1.3671875, + "learning_rate": 9.77456904797525e-06, + "long_answer_loss": 0.0672, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1525, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0726, + "grad_norm": 1.4765625, + "learning_rate": 9.75946467076599e-06, + "long_answer_loss": 0.0726, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1526, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0676, + "grad_norm": 1.375, + "learning_rate": 9.7443644959255e-06, + "long_answer_loss": 0.0676, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 1527, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0762, + "grad_norm": 1.515625, + "learning_rate": 9.729268546608565e-06, + "long_answer_loss": 0.0762, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1528, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0729, + "grad_norm": 1.5546875, + "learning_rate": 9.714176845963494e-06, + "long_answer_loss": 0.0729, + "loss": 0.0807, + "short_answer_loss": NaN, + "step": 1529, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0747, + "grad_norm": 1.453125, + "learning_rate": 9.69908941713207e-06, + "long_answer_loss": 0.0747, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 1530, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0722, + "grad_norm": 1.421875, + "learning_rate": 9.684006283249536e-06, + "long_answer_loss": 0.0722, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1531, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.078, + "grad_norm": 1.3984375, + "learning_rate": 9.668927467444538e-06, + "long_answer_loss": 0.078, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1532, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0885, + "grad_norm": 1.4140625, + "learning_rate": 9.65385299283912e-06, + "long_answer_loss": 0.0885, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1533, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0729, + "grad_norm": 1.5078125, + "learning_rate": 9.638782882548645e-06, + "long_answer_loss": 0.0729, + "loss": 0.0824, + "short_answer_loss": NaN, + "step": 1534, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0716, + "grad_norm": 1.4375, + "learning_rate": 9.623717159681805e-06, + "long_answer_loss": 0.0716, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1535, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0669, + "grad_norm": 1.4765625, + "learning_rate": 9.60865584734055e-06, + "long_answer_loss": 0.0669, + "loss": 0.0836, + "short_answer_loss": NaN, + "step": 1536, + "template_loss": 0.0 + }, + { + "epoch": 1.17, + "full_loss": 0.0793, + "grad_norm": 1.484375, + "learning_rate": 9.593598968620072e-06, + "long_answer_loss": 0.0793, + "loss": 0.0828, + "short_answer_loss": NaN, + "step": 1537, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0736, + "grad_norm": 1.421875, + "learning_rate": 9.578546546608766e-06, + "long_answer_loss": 0.0736, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1538, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0655, + "grad_norm": 1.546875, + "learning_rate": 9.563498604388183e-06, + "long_answer_loss": 0.0655, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1539, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0734, + "grad_norm": 1.390625, + "learning_rate": 9.548455165033023e-06, + "long_answer_loss": 0.0734, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 1540, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0869, + "grad_norm": 1.3671875, + "learning_rate": 9.533416251611064e-06, + "long_answer_loss": 0.0869, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1541, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0851, + "grad_norm": 1.421875, + "learning_rate": 9.51838188718316e-06, + "long_answer_loss": 0.0851, + "loss": 0.0826, + "short_answer_loss": NaN, + "step": 1542, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0894, + "grad_norm": 1.46875, + "learning_rate": 9.50335209480317e-06, + "long_answer_loss": 0.0894, + "loss": 0.0804, + "short_answer_loss": NaN, + "step": 1543, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0673, + "grad_norm": 1.3125, + "learning_rate": 9.48832689751796e-06, + "long_answer_loss": 0.0673, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1544, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0924, + "grad_norm": 1.53125, + "learning_rate": 9.473306318367334e-06, + "long_answer_loss": 0.0924, + "loss": 0.0839, + "short_answer_loss": NaN, + "step": 1545, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0821, + "grad_norm": 1.5625, + "learning_rate": 9.458290380384033e-06, + "long_answer_loss": 0.0821, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 1546, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0648, + "grad_norm": 1.3203125, + "learning_rate": 9.443279106593663e-06, + "long_answer_loss": 0.0648, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1547, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0897, + "grad_norm": 1.4296875, + "learning_rate": 9.428272520014691e-06, + "long_answer_loss": 0.0897, + "loss": 0.0816, + "short_answer_loss": NaN, + "step": 1548, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0823, + "grad_norm": 1.4921875, + "learning_rate": 9.413270643658393e-06, + "long_answer_loss": 0.0823, + "loss": 0.0818, + "short_answer_loss": NaN, + "step": 1549, + "template_loss": 0.0 + }, + { + "epoch": 1.18, + "full_loss": 0.0818, + "grad_norm": 1.421875, + "learning_rate": 9.398273500528811e-06, + "long_answer_loss": 0.0818, + "loss": 0.0846, + "short_answer_loss": NaN, + "step": 1550, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0732, + "grad_norm": 1.3828125, + "learning_rate": 9.383281113622753e-06, + "long_answer_loss": 0.0732, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1551, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0822, + "grad_norm": 1.46875, + "learning_rate": 9.368293505929707e-06, + "long_answer_loss": 0.0822, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 1552, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0738, + "grad_norm": 1.5, + "learning_rate": 9.353310700431852e-06, + "long_answer_loss": 0.0738, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1553, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0777, + "grad_norm": 1.4609375, + "learning_rate": 9.33833272010399e-06, + "long_answer_loss": 0.0777, + "loss": 0.0828, + "short_answer_loss": NaN, + "step": 1554, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0756, + "grad_norm": 1.3828125, + "learning_rate": 9.323359587913542e-06, + "long_answer_loss": 0.0756, + "loss": 0.0796, + "short_answer_loss": NaN, + "step": 1555, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0692, + "grad_norm": 1.5390625, + "learning_rate": 9.308391326820467e-06, + "long_answer_loss": 0.0692, + "loss": 0.0838, + "short_answer_loss": NaN, + "step": 1556, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0729, + "grad_norm": 1.3671875, + "learning_rate": 9.293427959777288e-06, + "long_answer_loss": 0.0729, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1557, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0667, + "grad_norm": 1.421875, + "learning_rate": 9.278469509728996e-06, + "long_answer_loss": 0.0667, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1558, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.1008, + "grad_norm": 1.5234375, + "learning_rate": 9.263515999613054e-06, + "long_answer_loss": 0.1008, + "loss": 0.0837, + "short_answer_loss": NaN, + "step": 1559, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0704, + "grad_norm": 1.3671875, + "learning_rate": 9.248567452359351e-06, + "long_answer_loss": 0.0704, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 1560, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0713, + "grad_norm": 1.4453125, + "learning_rate": 9.233623890890155e-06, + "long_answer_loss": 0.0713, + "loss": 0.0774, + "short_answer_loss": NaN, + "step": 1561, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0754, + "grad_norm": 1.515625, + "learning_rate": 9.218685338120109e-06, + "long_answer_loss": 0.0754, + "loss": 0.0841, + "short_answer_loss": NaN, + "step": 1562, + "template_loss": 0.0 + }, + { + "epoch": 1.19, + "full_loss": 0.0767, + "grad_norm": 1.4296875, + "learning_rate": 9.203751816956152e-06, + "long_answer_loss": 0.0767, + "loss": 0.0827, + "short_answer_loss": NaN, + "step": 1563, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0826, + "grad_norm": 1.4140625, + "learning_rate": 9.188823350297532e-06, + "long_answer_loss": 0.0826, + "loss": 0.0837, + "short_answer_loss": NaN, + "step": 1564, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0763, + "grad_norm": 1.3828125, + "learning_rate": 9.173899961035722e-06, + "long_answer_loss": 0.0763, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 1565, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0696, + "grad_norm": 1.3671875, + "learning_rate": 9.158981672054427e-06, + "long_answer_loss": 0.0696, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1566, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.085, + "grad_norm": 1.375, + "learning_rate": 9.144068506229524e-06, + "long_answer_loss": 0.085, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1567, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0872, + "grad_norm": 1.46875, + "learning_rate": 9.129160486429037e-06, + "long_answer_loss": 0.0872, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1568, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.077, + "grad_norm": 1.375, + "learning_rate": 9.114257635513093e-06, + "long_answer_loss": 0.077, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 1569, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0749, + "grad_norm": 1.421875, + "learning_rate": 9.099359976333893e-06, + "long_answer_loss": 0.0749, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1570, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0914, + "grad_norm": 1.4140625, + "learning_rate": 9.084467531735694e-06, + "long_answer_loss": 0.0914, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1571, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0904, + "grad_norm": 1.4140625, + "learning_rate": 9.06958032455473e-06, + "long_answer_loss": 0.0904, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 1572, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0669, + "grad_norm": 1.40625, + "learning_rate": 9.054698377619227e-06, + "long_answer_loss": 0.0669, + "loss": 0.0796, + "short_answer_loss": NaN, + "step": 1573, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0885, + "grad_norm": 1.3671875, + "learning_rate": 9.039821713749335e-06, + "long_answer_loss": 0.0885, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1574, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0885, + "grad_norm": 1.40625, + "learning_rate": 9.024950355757101e-06, + "long_answer_loss": 0.0885, + "loss": 0.0807, + "short_answer_loss": NaN, + "step": 1575, + "template_loss": 0.0 + }, + { + "epoch": 1.2, + "full_loss": 0.0827, + "grad_norm": 1.359375, + "learning_rate": 9.010084326446435e-06, + "long_answer_loss": 0.0827, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1576, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0735, + "grad_norm": 1.40625, + "learning_rate": 8.995223648613088e-06, + "long_answer_loss": 0.0735, + "loss": 0.0796, + "short_answer_loss": NaN, + "step": 1577, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0607, + "grad_norm": 1.359375, + "learning_rate": 8.980368345044587e-06, + "long_answer_loss": 0.0607, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 1578, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0662, + "grad_norm": 1.3828125, + "learning_rate": 8.965518438520238e-06, + "long_answer_loss": 0.0662, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1579, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0684, + "grad_norm": 1.4453125, + "learning_rate": 8.950673951811053e-06, + "long_answer_loss": 0.0684, + "loss": 0.084, + "short_answer_loss": NaN, + "step": 1580, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0806, + "grad_norm": 1.375, + "learning_rate": 8.93583490767974e-06, + "long_answer_loss": 0.0806, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1581, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0884, + "grad_norm": 1.5390625, + "learning_rate": 8.921001328880665e-06, + "long_answer_loss": 0.0884, + "loss": 0.0819, + "short_answer_loss": NaN, + "step": 1582, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0896, + "grad_norm": 1.4140625, + "learning_rate": 8.906173238159807e-06, + "long_answer_loss": 0.0896, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1583, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0696, + "grad_norm": 1.4296875, + "learning_rate": 8.89135065825474e-06, + "long_answer_loss": 0.0696, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1584, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0711, + "grad_norm": 1.375, + "learning_rate": 8.87653361189457e-06, + "long_answer_loss": 0.0711, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 1585, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0657, + "grad_norm": 1.484375, + "learning_rate": 8.861722121799942e-06, + "long_answer_loss": 0.0657, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1586, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0796, + "grad_norm": 1.578125, + "learning_rate": 8.846916210682951e-06, + "long_answer_loss": 0.0796, + "loss": 0.0807, + "short_answer_loss": NaN, + "step": 1587, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.0682, + "grad_norm": 1.3984375, + "learning_rate": 8.83211590124717e-06, + "long_answer_loss": 0.0682, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1588, + "template_loss": 0.0 + }, + { + "epoch": 1.21, + "full_loss": 0.074, + "grad_norm": 1.421875, + "learning_rate": 8.817321216187557e-06, + "long_answer_loss": 0.074, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1589, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0709, + "grad_norm": 1.390625, + "learning_rate": 8.802532178190453e-06, + "long_answer_loss": 0.0709, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1590, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.077, + "grad_norm": 1.453125, + "learning_rate": 8.787748809933546e-06, + "long_answer_loss": 0.077, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1591, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0866, + "grad_norm": 1.453125, + "learning_rate": 8.772971134085817e-06, + "long_answer_loss": 0.0866, + "loss": 0.079, + "short_answer_loss": NaN, + "step": 1592, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0719, + "grad_norm": 1.40625, + "learning_rate": 8.758199173307535e-06, + "long_answer_loss": 0.0719, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1593, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0687, + "grad_norm": 1.390625, + "learning_rate": 8.743432950250188e-06, + "long_answer_loss": 0.0687, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1594, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0776, + "grad_norm": 1.546875, + "learning_rate": 8.728672487556486e-06, + "long_answer_loss": 0.0776, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1595, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.066, + "grad_norm": 1.3984375, + "learning_rate": 8.713917807860284e-06, + "long_answer_loss": 0.066, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 1596, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0795, + "grad_norm": 1.40625, + "learning_rate": 8.699168933786584e-06, + "long_answer_loss": 0.0795, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 1597, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0724, + "grad_norm": 1.4921875, + "learning_rate": 8.684425887951477e-06, + "long_answer_loss": 0.0724, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1598, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0805, + "grad_norm": 1.4296875, + "learning_rate": 8.669688692962128e-06, + "long_answer_loss": 0.0805, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1599, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0714, + "grad_norm": 1.375, + "learning_rate": 8.654957371416722e-06, + "long_answer_loss": 0.0714, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1600, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.071, + "grad_norm": 1.4140625, + "learning_rate": 8.640231945904429e-06, + "long_answer_loss": 0.071, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1601, + "template_loss": 0.0 + }, + { + "epoch": 1.22, + "full_loss": 0.0719, + "grad_norm": 1.4453125, + "learning_rate": 8.625512439005401e-06, + "long_answer_loss": 0.0719, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1602, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0836, + "grad_norm": 1.4609375, + "learning_rate": 8.610798873290694e-06, + "long_answer_loss": 0.0836, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1603, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0761, + "grad_norm": 1.375, + "learning_rate": 8.596091271322262e-06, + "long_answer_loss": 0.0761, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1604, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0805, + "grad_norm": 1.4453125, + "learning_rate": 8.581389655652914e-06, + "long_answer_loss": 0.0805, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 1605, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0727, + "grad_norm": 1.375, + "learning_rate": 8.566694048826282e-06, + "long_answer_loss": 0.0727, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1606, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0762, + "grad_norm": 1.3515625, + "learning_rate": 8.55200447337677e-06, + "long_answer_loss": 0.0762, + "loss": 0.078, + "short_answer_loss": NaN, + "step": 1607, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0932, + "grad_norm": 1.375, + "learning_rate": 8.537320951829556e-06, + "long_answer_loss": 0.0932, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1608, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0736, + "grad_norm": 1.3828125, + "learning_rate": 8.522643506700511e-06, + "long_answer_loss": 0.0736, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1609, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.092, + "grad_norm": 1.4140625, + "learning_rate": 8.507972160496213e-06, + "long_answer_loss": 0.092, + "loss": 0.0802, + "short_answer_loss": NaN, + "step": 1610, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0819, + "grad_norm": 1.484375, + "learning_rate": 8.493306935713872e-06, + "long_answer_loss": 0.0819, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1611, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.081, + "grad_norm": 1.3515625, + "learning_rate": 8.478647854841304e-06, + "long_answer_loss": 0.081, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 1612, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0652, + "grad_norm": 1.3984375, + "learning_rate": 8.463994940356926e-06, + "long_answer_loss": 0.0652, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 1613, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0663, + "grad_norm": 1.4296875, + "learning_rate": 8.449348214729678e-06, + "long_answer_loss": 0.0663, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1614, + "template_loss": 0.0 + }, + { + "epoch": 1.23, + "full_loss": 0.0695, + "grad_norm": 1.390625, + "learning_rate": 8.434707700419028e-06, + "long_answer_loss": 0.0695, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 1615, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0718, + "grad_norm": 1.4453125, + "learning_rate": 8.420073419874905e-06, + "long_answer_loss": 0.0718, + "loss": 0.0789, + "short_answer_loss": NaN, + "step": 1616, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0677, + "grad_norm": 1.46875, + "learning_rate": 8.405445395537692e-06, + "long_answer_loss": 0.0677, + "loss": 0.0803, + "short_answer_loss": NaN, + "step": 1617, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0772, + "grad_norm": 1.734375, + "learning_rate": 8.390823649838164e-06, + "long_answer_loss": 0.0772, + "loss": 0.0892, + "short_answer_loss": NaN, + "step": 1618, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0532, + "grad_norm": 1.4140625, + "learning_rate": 8.376208205197484e-06, + "long_answer_loss": 0.0532, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 1619, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0623, + "grad_norm": 1.40625, + "learning_rate": 8.361599084027136e-06, + "long_answer_loss": 0.0623, + "loss": 0.0765, + "short_answer_loss": NaN, + "step": 1620, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0779, + "grad_norm": 1.3984375, + "learning_rate": 8.346996308728922e-06, + "long_answer_loss": 0.0779, + "loss": 0.0771, + "short_answer_loss": NaN, + "step": 1621, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0653, + "grad_norm": 1.4296875, + "learning_rate": 8.33239990169491e-06, + "long_answer_loss": 0.0653, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 1622, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0766, + "grad_norm": 1.484375, + "learning_rate": 8.31780988530739e-06, + "long_answer_loss": 0.0766, + "loss": 0.0805, + "short_answer_loss": NaN, + "step": 1623, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0784, + "grad_norm": 1.46875, + "learning_rate": 8.303226281938875e-06, + "long_answer_loss": 0.0784, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1624, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0739, + "grad_norm": 1.4453125, + "learning_rate": 8.288649113952025e-06, + "long_answer_loss": 0.0739, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 1625, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0635, + "grad_norm": 1.453125, + "learning_rate": 8.274078403699642e-06, + "long_answer_loss": 0.0635, + "loss": 0.0781, + "short_answer_loss": NaN, + "step": 1626, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0746, + "grad_norm": 1.46875, + "learning_rate": 8.25951417352462e-06, + "long_answer_loss": 0.0746, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 1627, + "template_loss": 0.0 + }, + { + "epoch": 1.24, + "full_loss": 0.0599, + "grad_norm": 1.3125, + "learning_rate": 8.244956445759928e-06, + "long_answer_loss": 0.0599, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 1628, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0597, + "grad_norm": 1.4453125, + "learning_rate": 8.23040524272854e-06, + "long_answer_loss": 0.0597, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1629, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0814, + "grad_norm": 1.5, + "learning_rate": 8.21586058674345e-06, + "long_answer_loss": 0.0814, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1630, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0666, + "grad_norm": 1.4453125, + "learning_rate": 8.201322500107606e-06, + "long_answer_loss": 0.0666, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1631, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0777, + "grad_norm": 1.4296875, + "learning_rate": 8.186791005113866e-06, + "long_answer_loss": 0.0777, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1632, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0824, + "grad_norm": 1.515625, + "learning_rate": 8.172266124045009e-06, + "long_answer_loss": 0.0824, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 1633, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0808, + "grad_norm": 1.4609375, + "learning_rate": 8.157747879173646e-06, + "long_answer_loss": 0.0808, + "loss": 0.0817, + "short_answer_loss": NaN, + "step": 1634, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0852, + "grad_norm": 1.390625, + "learning_rate": 8.143236292762229e-06, + "long_answer_loss": 0.0852, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1635, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0769, + "grad_norm": 1.3515625, + "learning_rate": 8.128731387062986e-06, + "long_answer_loss": 0.0769, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 1636, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0694, + "grad_norm": 1.453125, + "learning_rate": 8.114233184317918e-06, + "long_answer_loss": 0.0694, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 1637, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0749, + "grad_norm": 1.328125, + "learning_rate": 8.099741706758726e-06, + "long_answer_loss": 0.0749, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1638, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0646, + "grad_norm": 1.421875, + "learning_rate": 8.085256976606825e-06, + "long_answer_loss": 0.0646, + "loss": 0.0785, + "short_answer_loss": NaN, + "step": 1639, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0623, + "grad_norm": 1.3359375, + "learning_rate": 8.070779016073256e-06, + "long_answer_loss": 0.0623, + "loss": 0.0805, + "short_answer_loss": NaN, + "step": 1640, + "template_loss": 0.0 + }, + { + "epoch": 1.25, + "full_loss": 0.0639, + "grad_norm": 1.34375, + "learning_rate": 8.056307847358701e-06, + "long_answer_loss": 0.0639, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 1641, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0624, + "grad_norm": 1.390625, + "learning_rate": 8.041843492653411e-06, + "long_answer_loss": 0.0624, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 1642, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0897, + "grad_norm": 1.390625, + "learning_rate": 8.0273859741372e-06, + "long_answer_loss": 0.0897, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1643, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.084, + "grad_norm": 1.3671875, + "learning_rate": 8.012935313979398e-06, + "long_answer_loss": 0.084, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1644, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0779, + "grad_norm": 1.4453125, + "learning_rate": 7.998491534338807e-06, + "long_answer_loss": 0.0779, + "loss": 0.0798, + "short_answer_loss": NaN, + "step": 1645, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0888, + "grad_norm": 1.4375, + "learning_rate": 7.984054657363696e-06, + "long_answer_loss": 0.0888, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 1646, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0887, + "grad_norm": 1.421875, + "learning_rate": 7.96962470519173e-06, + "long_answer_loss": 0.0887, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 1647, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0645, + "grad_norm": 1.453125, + "learning_rate": 7.95520169994998e-06, + "long_answer_loss": 0.0645, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 1648, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0737, + "grad_norm": 1.4140625, + "learning_rate": 7.940785663754837e-06, + "long_answer_loss": 0.0737, + "loss": 0.0805, + "short_answer_loss": NaN, + "step": 1649, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0875, + "grad_norm": 1.4140625, + "learning_rate": 7.926376618712027e-06, + "long_answer_loss": 0.0875, + "loss": 0.0799, + "short_answer_loss": NaN, + "step": 1650, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0746, + "grad_norm": 1.390625, + "learning_rate": 7.911974586916543e-06, + "long_answer_loss": 0.0746, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1651, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0704, + "grad_norm": 1.3984375, + "learning_rate": 7.897579590452625e-06, + "long_answer_loss": 0.0704, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 1652, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0743, + "grad_norm": 1.5703125, + "learning_rate": 7.883191651393737e-06, + "long_answer_loss": 0.0743, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1653, + "template_loss": 0.0 + }, + { + "epoch": 1.26, + "full_loss": 0.0652, + "grad_norm": 1.4375, + "learning_rate": 7.868810791802503e-06, + "long_answer_loss": 0.0652, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 1654, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0738, + "grad_norm": 1.3671875, + "learning_rate": 7.85443703373071e-06, + "long_answer_loss": 0.0738, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 1655, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0888, + "grad_norm": 1.4375, + "learning_rate": 7.84007039921924e-06, + "long_answer_loss": 0.0888, + "loss": 0.0819, + "short_answer_loss": NaN, + "step": 1656, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0782, + "grad_norm": 1.390625, + "learning_rate": 7.82571091029806e-06, + "long_answer_loss": 0.0782, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1657, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0793, + "grad_norm": 1.3828125, + "learning_rate": 7.811358588986167e-06, + "long_answer_loss": 0.0793, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 1658, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0771, + "grad_norm": 1.3984375, + "learning_rate": 7.797013457291596e-06, + "long_answer_loss": 0.0771, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1659, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0918, + "grad_norm": 1.3828125, + "learning_rate": 7.782675537211323e-06, + "long_answer_loss": 0.0918, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1660, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0704, + "grad_norm": 1.40625, + "learning_rate": 7.768344850731293e-06, + "long_answer_loss": 0.0704, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1661, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0665, + "grad_norm": 1.3125, + "learning_rate": 7.754021419826344e-06, + "long_answer_loss": 0.0665, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 1662, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0666, + "grad_norm": 1.421875, + "learning_rate": 7.739705266460182e-06, + "long_answer_loss": 0.0666, + "loss": 0.0785, + "short_answer_loss": NaN, + "step": 1663, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0852, + "grad_norm": 1.4609375, + "learning_rate": 7.725396412585378e-06, + "long_answer_loss": 0.0852, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 1664, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0709, + "grad_norm": 1.390625, + "learning_rate": 7.711094880143286e-06, + "long_answer_loss": 0.0709, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1665, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0749, + "grad_norm": 1.4765625, + "learning_rate": 7.696800691064047e-06, + "long_answer_loss": 0.0749, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1666, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0613, + "grad_norm": 1.375, + "learning_rate": 7.682513867266528e-06, + "long_answer_loss": 0.0613, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 1667, + "template_loss": 0.0 + }, + { + "epoch": 1.27, + "full_loss": 0.0689, + "grad_norm": 1.4453125, + "learning_rate": 7.668234430658325e-06, + "long_answer_loss": 0.0689, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1668, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0632, + "grad_norm": 1.390625, + "learning_rate": 7.653962403135678e-06, + "long_answer_loss": 0.0632, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1669, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.084, + "grad_norm": 1.3203125, + "learning_rate": 7.639697806583493e-06, + "long_answer_loss": 0.084, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1670, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0795, + "grad_norm": 1.3671875, + "learning_rate": 7.625440662875258e-06, + "long_answer_loss": 0.0795, + "loss": 0.0778, + "short_answer_loss": NaN, + "step": 1671, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.1012, + "grad_norm": 1.4375, + "learning_rate": 7.611190993873052e-06, + "long_answer_loss": 0.1012, + "loss": 0.0809, + "short_answer_loss": NaN, + "step": 1672, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0862, + "grad_norm": 1.4375, + "learning_rate": 7.596948821427477e-06, + "long_answer_loss": 0.0862, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1673, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0783, + "grad_norm": 1.40625, + "learning_rate": 7.582714167377644e-06, + "long_answer_loss": 0.0783, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 1674, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0754, + "grad_norm": 1.3515625, + "learning_rate": 7.568487053551146e-06, + "long_answer_loss": 0.0754, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1675, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0815, + "grad_norm": 1.3671875, + "learning_rate": 7.554267501763993e-06, + "long_answer_loss": 0.0815, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 1676, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0601, + "grad_norm": 1.40625, + "learning_rate": 7.540055533820625e-06, + "long_answer_loss": 0.0601, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 1677, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0544, + "grad_norm": 1.3125, + "learning_rate": 7.525851171513828e-06, + "long_answer_loss": 0.0544, + "loss": 0.0683, + "short_answer_loss": NaN, + "step": 1678, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0844, + "grad_norm": 1.4921875, + "learning_rate": 7.51165443662474e-06, + "long_answer_loss": 0.0844, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1679, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0779, + "grad_norm": 1.4296875, + "learning_rate": 7.497465350922802e-06, + "long_answer_loss": 0.0779, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 1680, + "template_loss": 0.0 + }, + { + "epoch": 1.28, + "full_loss": 0.0749, + "grad_norm": 1.484375, + "learning_rate": 7.483283936165725e-06, + "long_answer_loss": 0.0749, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1681, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0689, + "grad_norm": 1.4609375, + "learning_rate": 7.469110214099448e-06, + "long_answer_loss": 0.0689, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1682, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0852, + "grad_norm": 1.453125, + "learning_rate": 7.454944206458123e-06, + "long_answer_loss": 0.0852, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1683, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0885, + "grad_norm": 1.4453125, + "learning_rate": 7.440785934964077e-06, + "long_answer_loss": 0.0885, + "loss": 0.082, + "short_answer_loss": NaN, + "step": 1684, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0694, + "grad_norm": 1.53125, + "learning_rate": 7.42663542132776e-06, + "long_answer_loss": 0.0694, + "loss": 0.0811, + "short_answer_loss": NaN, + "step": 1685, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0914, + "grad_norm": 1.4921875, + "learning_rate": 7.412492687247744e-06, + "long_answer_loss": 0.0914, + "loss": 0.0827, + "short_answer_loss": NaN, + "step": 1686, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0654, + "grad_norm": 1.40625, + "learning_rate": 7.398357754410653e-06, + "long_answer_loss": 0.0654, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 1687, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0701, + "grad_norm": 1.546875, + "learning_rate": 7.384230644491163e-06, + "long_answer_loss": 0.0701, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1688, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0637, + "grad_norm": 1.3984375, + "learning_rate": 7.370111379151943e-06, + "long_answer_loss": 0.0637, + "loss": 0.0724, + "short_answer_loss": NaN, + "step": 1689, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0687, + "grad_norm": 1.390625, + "learning_rate": 7.355999980043648e-06, + "long_answer_loss": 0.0687, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1690, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0837, + "grad_norm": 1.4140625, + "learning_rate": 7.341896468804853e-06, + "long_answer_loss": 0.0837, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 1691, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0973, + "grad_norm": 1.40625, + "learning_rate": 7.327800867062054e-06, + "long_answer_loss": 0.0973, + "loss": 0.0789, + "short_answer_loss": NaN, + "step": 1692, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0448, + "grad_norm": 1.421875, + "learning_rate": 7.313713196429606e-06, + "long_answer_loss": 0.0448, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1693, + "template_loss": 0.0 + }, + { + "epoch": 1.29, + "full_loss": 0.0771, + "grad_norm": 1.4140625, + "learning_rate": 7.2996334785097055e-06, + "long_answer_loss": 0.0771, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1694, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0806, + "grad_norm": 1.4453125, + "learning_rate": 7.285561734892357e-06, + "long_answer_loss": 0.0806, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1695, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0876, + "grad_norm": 1.40625, + "learning_rate": 7.27149798715534e-06, + "long_answer_loss": 0.0876, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1696, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0584, + "grad_norm": 1.4296875, + "learning_rate": 7.2574422568641635e-06, + "long_answer_loss": 0.0584, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1697, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0629, + "grad_norm": 1.421875, + "learning_rate": 7.243394565572051e-06, + "long_answer_loss": 0.0629, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1698, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0778, + "grad_norm": 1.4140625, + "learning_rate": 7.2293549348199e-06, + "long_answer_loss": 0.0778, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 1699, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0836, + "grad_norm": 1.3515625, + "learning_rate": 7.21532338613623e-06, + "long_answer_loss": 0.0836, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 1700, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0734, + "grad_norm": 1.453125, + "learning_rate": 7.201299941037199e-06, + "long_answer_loss": 0.0734, + "loss": 0.0827, + "short_answer_loss": NaN, + "step": 1701, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0595, + "grad_norm": 1.3359375, + "learning_rate": 7.187284621026508e-06, + "long_answer_loss": 0.0595, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 1702, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0779, + "grad_norm": 1.3515625, + "learning_rate": 7.173277447595414e-06, + "long_answer_loss": 0.0779, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1703, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0778, + "grad_norm": 1.3828125, + "learning_rate": 7.159278442222683e-06, + "long_answer_loss": 0.0778, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1704, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0756, + "grad_norm": 1.3046875, + "learning_rate": 7.14528762637455e-06, + "long_answer_loss": 0.0756, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1705, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0649, + "grad_norm": 1.3671875, + "learning_rate": 7.131305021504697e-06, + "long_answer_loss": 0.0649, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1706, + "template_loss": 0.0 + }, + { + "epoch": 1.3, + "full_loss": 0.0722, + "grad_norm": 1.4453125, + "learning_rate": 7.117330649054213e-06, + "long_answer_loss": 0.0722, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1707, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0833, + "grad_norm": 1.4375, + "learning_rate": 7.103364530451567e-06, + "long_answer_loss": 0.0833, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 1708, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0821, + "grad_norm": 1.5625, + "learning_rate": 7.089406687112554e-06, + "long_answer_loss": 0.0821, + "loss": 0.08, + "short_answer_loss": NaN, + "step": 1709, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0648, + "grad_norm": 1.390625, + "learning_rate": 7.075457140440312e-06, + "long_answer_loss": 0.0648, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1710, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0715, + "grad_norm": 1.3359375, + "learning_rate": 7.06151591182522e-06, + "long_answer_loss": 0.0715, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 1711, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0784, + "grad_norm": 1.3828125, + "learning_rate": 7.047583022644938e-06, + "long_answer_loss": 0.0784, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 1712, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0715, + "grad_norm": 1.4453125, + "learning_rate": 7.033658494264309e-06, + "long_answer_loss": 0.0715, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1713, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0836, + "grad_norm": 1.4765625, + "learning_rate": 7.0197423480353685e-06, + "long_answer_loss": 0.0836, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 1714, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0661, + "grad_norm": 1.4140625, + "learning_rate": 7.005834605297303e-06, + "long_answer_loss": 0.0661, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1715, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0971, + "grad_norm": 1.515625, + "learning_rate": 6.9919352873763915e-06, + "long_answer_loss": 0.0971, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1716, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0748, + "grad_norm": 1.4453125, + "learning_rate": 6.978044415586032e-06, + "long_answer_loss": 0.0748, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 1717, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0727, + "grad_norm": 1.40625, + "learning_rate": 6.9641620112266284e-06, + "long_answer_loss": 0.0727, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1718, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0756, + "grad_norm": 1.4296875, + "learning_rate": 6.9502880955856385e-06, + "long_answer_loss": 0.0756, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1719, + "template_loss": 0.0 + }, + { + "epoch": 1.31, + "full_loss": 0.0645, + "grad_norm": 1.4296875, + "learning_rate": 6.936422689937475e-06, + "long_answer_loss": 0.0645, + "loss": 0.0801, + "short_answer_loss": NaN, + "step": 1720, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0733, + "grad_norm": 1.359375, + "learning_rate": 6.9225658155435146e-06, + "long_answer_loss": 0.0733, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 1721, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0721, + "grad_norm": 1.3984375, + "learning_rate": 6.9087174936520505e-06, + "long_answer_loss": 0.0721, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 1722, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0769, + "grad_norm": 1.484375, + "learning_rate": 6.89487774549826e-06, + "long_answer_loss": 0.0769, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1723, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0686, + "grad_norm": 1.5, + "learning_rate": 6.88104659230418e-06, + "long_answer_loss": 0.0686, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 1724, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0856, + "grad_norm": 1.4453125, + "learning_rate": 6.867224055278648e-06, + "long_answer_loss": 0.0856, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1725, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0745, + "grad_norm": 1.34375, + "learning_rate": 6.853410155617321e-06, + "long_answer_loss": 0.0745, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 1726, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0689, + "grad_norm": 1.4375, + "learning_rate": 6.839604914502577e-06, + "long_answer_loss": 0.0689, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 1727, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0613, + "grad_norm": 1.375, + "learning_rate": 6.825808353103542e-06, + "long_answer_loss": 0.0613, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1728, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0746, + "grad_norm": 1.390625, + "learning_rate": 6.812020492576024e-06, + "long_answer_loss": 0.0746, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 1729, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0736, + "grad_norm": 1.4765625, + "learning_rate": 6.798241354062484e-06, + "long_answer_loss": 0.0736, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 1730, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0878, + "grad_norm": 1.46875, + "learning_rate": 6.784470958692018e-06, + "long_answer_loss": 0.0878, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 1731, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0788, + "grad_norm": 1.3671875, + "learning_rate": 6.77070932758031e-06, + "long_answer_loss": 0.0788, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1732, + "template_loss": 0.0 + }, + { + "epoch": 1.32, + "full_loss": 0.0836, + "grad_norm": 1.421875, + "learning_rate": 6.75695648182961e-06, + "long_answer_loss": 0.0836, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 1733, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.076, + "grad_norm": 1.3828125, + "learning_rate": 6.743212442528673e-06, + "long_answer_loss": 0.076, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1734, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0818, + "grad_norm": 1.34375, + "learning_rate": 6.729477230752796e-06, + "long_answer_loss": 0.0818, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 1735, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0648, + "grad_norm": 1.640625, + "learning_rate": 6.715750867563692e-06, + "long_answer_loss": 0.0648, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 1736, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0825, + "grad_norm": 1.484375, + "learning_rate": 6.7020333740095305e-06, + "long_answer_loss": 0.0825, + "loss": 0.0778, + "short_answer_loss": NaN, + "step": 1737, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0636, + "grad_norm": 1.3125, + "learning_rate": 6.688324771124881e-06, + "long_answer_loss": 0.0636, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 1738, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0849, + "grad_norm": 1.4765625, + "learning_rate": 6.67462507993067e-06, + "long_answer_loss": 0.0849, + "loss": 0.0797, + "short_answer_loss": NaN, + "step": 1739, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0857, + "grad_norm": 1.4765625, + "learning_rate": 6.660934321434166e-06, + "long_answer_loss": 0.0857, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1740, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0784, + "grad_norm": 1.4140625, + "learning_rate": 6.647252516628936e-06, + "long_answer_loss": 0.0784, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 1741, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0727, + "grad_norm": 1.4609375, + "learning_rate": 6.63357968649482e-06, + "long_answer_loss": 0.0727, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1742, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.054, + "grad_norm": 1.3515625, + "learning_rate": 6.619915851997899e-06, + "long_answer_loss": 0.054, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 1743, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0541, + "grad_norm": 1.40625, + "learning_rate": 6.606261034090446e-06, + "long_answer_loss": 0.0541, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1744, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.0677, + "grad_norm": 1.5390625, + "learning_rate": 6.592615253710922e-06, + "long_answer_loss": 0.0677, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1745, + "template_loss": 0.0 + }, + { + "epoch": 1.33, + "full_loss": 0.079, + "grad_norm": 1.4765625, + "learning_rate": 6.5789785317839275e-06, + "long_answer_loss": 0.079, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 1746, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0649, + "grad_norm": 1.4296875, + "learning_rate": 6.5653508892201675e-06, + "long_answer_loss": 0.0649, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1747, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0674, + "grad_norm": 1.3125, + "learning_rate": 6.551732346916431e-06, + "long_answer_loss": 0.0674, + "loss": 0.0649, + "short_answer_loss": NaN, + "step": 1748, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0663, + "grad_norm": 1.4140625, + "learning_rate": 6.538122925755549e-06, + "long_answer_loss": 0.0663, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 1749, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0682, + "grad_norm": 1.4375, + "learning_rate": 6.524522646606362e-06, + "long_answer_loss": 0.0682, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 1750, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0738, + "grad_norm": 1.484375, + "learning_rate": 6.5109315303237026e-06, + "long_answer_loss": 0.0738, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1751, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0788, + "grad_norm": 1.421875, + "learning_rate": 6.4973495977483475e-06, + "long_answer_loss": 0.0788, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 1752, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0776, + "grad_norm": 1.4375, + "learning_rate": 6.4837768697069755e-06, + "long_answer_loss": 0.0776, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 1753, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0822, + "grad_norm": 1.390625, + "learning_rate": 6.470213367012187e-06, + "long_answer_loss": 0.0822, + "loss": 0.0786, + "short_answer_loss": NaN, + "step": 1754, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0679, + "grad_norm": 1.4140625, + "learning_rate": 6.456659110462402e-06, + "long_answer_loss": 0.0679, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 1755, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0776, + "grad_norm": 1.4453125, + "learning_rate": 6.443114120841874e-06, + "long_answer_loss": 0.0776, + "loss": 0.0828, + "short_answer_loss": NaN, + "step": 1756, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0642, + "grad_norm": 1.453125, + "learning_rate": 6.429578418920653e-06, + "long_answer_loss": 0.0642, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 1757, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0631, + "grad_norm": 1.3203125, + "learning_rate": 6.41605202545454e-06, + "long_answer_loss": 0.0631, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 1758, + "template_loss": 0.0 + }, + { + "epoch": 1.34, + "full_loss": 0.0583, + "grad_norm": 1.4296875, + "learning_rate": 6.402534961185069e-06, + "long_answer_loss": 0.0583, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 1759, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0731, + "grad_norm": 1.4140625, + "learning_rate": 6.389027246839452e-06, + "long_answer_loss": 0.0731, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1760, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0752, + "grad_norm": 1.484375, + "learning_rate": 6.37552890313059e-06, + "long_answer_loss": 0.0752, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1761, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0836, + "grad_norm": 1.453125, + "learning_rate": 6.362039950756983e-06, + "long_answer_loss": 0.0836, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1762, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.074, + "grad_norm": 1.3046875, + "learning_rate": 6.348560410402768e-06, + "long_answer_loss": 0.074, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 1763, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0748, + "grad_norm": 1.3515625, + "learning_rate": 6.3350903027376135e-06, + "long_answer_loss": 0.0748, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 1764, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.08, + "grad_norm": 1.4296875, + "learning_rate": 6.321629648416743e-06, + "long_answer_loss": 0.08, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 1765, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0757, + "grad_norm": 1.5078125, + "learning_rate": 6.308178468080886e-06, + "long_answer_loss": 0.0757, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1766, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0829, + "grad_norm": 1.3671875, + "learning_rate": 6.294736782356231e-06, + "long_answer_loss": 0.0829, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1767, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0741, + "grad_norm": 1.40625, + "learning_rate": 6.281304611854427e-06, + "long_answer_loss": 0.0741, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1768, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0715, + "grad_norm": 1.4296875, + "learning_rate": 6.2678819771725015e-06, + "long_answer_loss": 0.0715, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 1769, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0707, + "grad_norm": 1.375, + "learning_rate": 6.2544688988929e-06, + "long_answer_loss": 0.0707, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1770, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0735, + "grad_norm": 1.421875, + "learning_rate": 6.241065397583374e-06, + "long_answer_loss": 0.0735, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 1771, + "template_loss": 0.0 + }, + { + "epoch": 1.35, + "full_loss": 0.0703, + "grad_norm": 1.484375, + "learning_rate": 6.227671493797027e-06, + "long_answer_loss": 0.0703, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 1772, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0697, + "grad_norm": 1.359375, + "learning_rate": 6.214287208072211e-06, + "long_answer_loss": 0.0697, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 1773, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0742, + "grad_norm": 1.4375, + "learning_rate": 6.200912560932554e-06, + "long_answer_loss": 0.0742, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1774, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0803, + "grad_norm": 1.4453125, + "learning_rate": 6.187547572886897e-06, + "long_answer_loss": 0.0803, + "loss": 0.0789, + "short_answer_loss": NaN, + "step": 1775, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0582, + "grad_norm": 1.296875, + "learning_rate": 6.174192264429256e-06, + "long_answer_loss": 0.0582, + "loss": 0.067, + "short_answer_loss": NaN, + "step": 1776, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0649, + "grad_norm": 1.3359375, + "learning_rate": 6.160846656038835e-06, + "long_answer_loss": 0.0649, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 1777, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0749, + "grad_norm": 1.4296875, + "learning_rate": 6.147510768179924e-06, + "long_answer_loss": 0.0749, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 1778, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0704, + "grad_norm": 1.40625, + "learning_rate": 6.134184621301952e-06, + "long_answer_loss": 0.0704, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1779, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.081, + "grad_norm": 1.390625, + "learning_rate": 6.120868235839369e-06, + "long_answer_loss": 0.081, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1780, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0757, + "grad_norm": 1.53125, + "learning_rate": 6.107561632211683e-06, + "long_answer_loss": 0.0757, + "loss": 0.078, + "short_answer_loss": NaN, + "step": 1781, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0741, + "grad_norm": 1.3359375, + "learning_rate": 6.094264830823395e-06, + "long_answer_loss": 0.0741, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 1782, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0723, + "grad_norm": 1.375, + "learning_rate": 6.0809778520639734e-06, + "long_answer_loss": 0.0723, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1783, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0763, + "grad_norm": 1.4140625, + "learning_rate": 6.067700716307827e-06, + "long_answer_loss": 0.0763, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 1784, + "template_loss": 0.0 + }, + { + "epoch": 1.36, + "full_loss": 0.0613, + "grad_norm": 1.484375, + "learning_rate": 6.05443344391427e-06, + "long_answer_loss": 0.0613, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 1785, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0635, + "grad_norm": 1.453125, + "learning_rate": 6.041176055227498e-06, + "long_answer_loss": 0.0635, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1786, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0855, + "grad_norm": 1.421875, + "learning_rate": 6.027928570576528e-06, + "long_answer_loss": 0.0855, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1787, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0857, + "grad_norm": 1.40625, + "learning_rate": 6.014691010275231e-06, + "long_answer_loss": 0.0857, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 1788, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0804, + "grad_norm": 1.4609375, + "learning_rate": 6.001463394622217e-06, + "long_answer_loss": 0.0804, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1789, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0486, + "grad_norm": 1.390625, + "learning_rate": 5.988245743900874e-06, + "long_answer_loss": 0.0486, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 1790, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0874, + "grad_norm": 1.34375, + "learning_rate": 5.975038078379299e-06, + "long_answer_loss": 0.0874, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 1791, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0758, + "grad_norm": 1.453125, + "learning_rate": 5.96184041831028e-06, + "long_answer_loss": 0.0758, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1792, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0746, + "grad_norm": 1.421875, + "learning_rate": 5.948652783931266e-06, + "long_answer_loss": 0.0746, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1793, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0804, + "grad_norm": 1.4453125, + "learning_rate": 5.935475195464326e-06, + "long_answer_loss": 0.0804, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 1794, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0702, + "grad_norm": 1.6796875, + "learning_rate": 5.922307673116132e-06, + "long_answer_loss": 0.0702, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 1795, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0683, + "grad_norm": 1.3828125, + "learning_rate": 5.909150237077908e-06, + "long_answer_loss": 0.0683, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1796, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0718, + "grad_norm": 1.390625, + "learning_rate": 5.896002907525424e-06, + "long_answer_loss": 0.0718, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 1797, + "template_loss": 0.0 + }, + { + "epoch": 1.37, + "full_loss": 0.0745, + "grad_norm": 1.5234375, + "learning_rate": 5.8828657046189474e-06, + "long_answer_loss": 0.0745, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 1798, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0631, + "grad_norm": 1.4375, + "learning_rate": 5.86973864850322e-06, + "long_answer_loss": 0.0631, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 1799, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0641, + "grad_norm": 1.3984375, + "learning_rate": 5.856621759307421e-06, + "long_answer_loss": 0.0641, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 1800, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0829, + "grad_norm": 1.3515625, + "learning_rate": 5.843515057145139e-06, + "long_answer_loss": 0.0829, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 1801, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0813, + "grad_norm": 1.4296875, + "learning_rate": 5.830418562114348e-06, + "long_answer_loss": 0.0813, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 1802, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0873, + "grad_norm": 1.453125, + "learning_rate": 5.8173322942973634e-06, + "long_answer_loss": 0.0873, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 1803, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0811, + "grad_norm": 1.5859375, + "learning_rate": 5.804256273760819e-06, + "long_answer_loss": 0.0811, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1804, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0725, + "grad_norm": 1.4921875, + "learning_rate": 5.791190520555645e-06, + "long_answer_loss": 0.0725, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 1805, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.088, + "grad_norm": 1.390625, + "learning_rate": 5.778135054717008e-06, + "long_answer_loss": 0.088, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 1806, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0912, + "grad_norm": 1.390625, + "learning_rate": 5.7650898962643165e-06, + "long_answer_loss": 0.0912, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 1807, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0726, + "grad_norm": 1.3828125, + "learning_rate": 5.752055065201167e-06, + "long_answer_loss": 0.0726, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 1808, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0637, + "grad_norm": 1.421875, + "learning_rate": 5.739030581515324e-06, + "long_answer_loss": 0.0637, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 1809, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.0958, + "grad_norm": 1.40625, + "learning_rate": 5.726016465178681e-06, + "long_answer_loss": 0.0958, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 1810, + "template_loss": 0.0 + }, + { + "epoch": 1.38, + "full_loss": 0.07, + "grad_norm": 1.3671875, + "learning_rate": 5.7130127361472345e-06, + "long_answer_loss": 0.07, + "loss": 0.0724, + "short_answer_loss": NaN, + "step": 1811, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0828, + "grad_norm": 1.40625, + "learning_rate": 5.700019414361059e-06, + "long_answer_loss": 0.0828, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 1812, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0673, + "grad_norm": 1.4140625, + "learning_rate": 5.687036519744251e-06, + "long_answer_loss": 0.0673, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 1813, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0633, + "grad_norm": 1.34375, + "learning_rate": 5.674064072204953e-06, + "long_answer_loss": 0.0633, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1814, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.084, + "grad_norm": 1.390625, + "learning_rate": 5.661102091635251e-06, + "long_answer_loss": 0.084, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1815, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0624, + "grad_norm": 1.3359375, + "learning_rate": 5.648150597911203e-06, + "long_answer_loss": 0.0624, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 1816, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0526, + "grad_norm": 1.3828125, + "learning_rate": 5.635209610892779e-06, + "long_answer_loss": 0.0526, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 1817, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0765, + "grad_norm": 1.3828125, + "learning_rate": 5.622279150423839e-06, + "long_answer_loss": 0.0765, + "loss": 0.0671, + "short_answer_loss": NaN, + "step": 1818, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0944, + "grad_norm": 1.375, + "learning_rate": 5.609359236332107e-06, + "long_answer_loss": 0.0944, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 1819, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0833, + "grad_norm": 1.4140625, + "learning_rate": 5.596449888429116e-06, + "long_answer_loss": 0.0833, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 1820, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0754, + "grad_norm": 1.390625, + "learning_rate": 5.5835511265102265e-06, + "long_answer_loss": 0.0754, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 1821, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.059, + "grad_norm": 1.390625, + "learning_rate": 5.570662970354536e-06, + "long_answer_loss": 0.059, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1822, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0896, + "grad_norm": 1.453125, + "learning_rate": 5.557785439724908e-06, + "long_answer_loss": 0.0896, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 1823, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0861, + "grad_norm": 1.4921875, + "learning_rate": 5.544918554367879e-06, + "long_answer_loss": 0.0861, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 1824, + "template_loss": 0.0 + }, + { + "epoch": 1.39, + "full_loss": 0.0703, + "grad_norm": 1.421875, + "learning_rate": 5.532062334013703e-06, + "long_answer_loss": 0.0703, + "loss": 0.0685, + "short_answer_loss": NaN, + "step": 1825, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0631, + "grad_norm": 1.4609375, + "learning_rate": 5.5192167983762425e-06, + "long_answer_loss": 0.0631, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1826, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0906, + "grad_norm": 1.453125, + "learning_rate": 5.5063819671529935e-06, + "long_answer_loss": 0.0906, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1827, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0906, + "grad_norm": 1.515625, + "learning_rate": 5.493557860025042e-06, + "long_answer_loss": 0.0906, + "loss": 0.0806, + "short_answer_loss": NaN, + "step": 1828, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0662, + "grad_norm": 1.53125, + "learning_rate": 5.48074449665701e-06, + "long_answer_loss": 0.0662, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1829, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.075, + "grad_norm": 1.46875, + "learning_rate": 5.467941896697075e-06, + "long_answer_loss": 0.075, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 1830, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0731, + "grad_norm": 1.4375, + "learning_rate": 5.455150079776876e-06, + "long_answer_loss": 0.0731, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1831, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0772, + "grad_norm": 1.46875, + "learning_rate": 5.442369065511552e-06, + "long_answer_loss": 0.0772, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1832, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0597, + "grad_norm": 1.5859375, + "learning_rate": 5.429598873499643e-06, + "long_answer_loss": 0.0597, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 1833, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0769, + "grad_norm": 1.4140625, + "learning_rate": 5.416839523323118e-06, + "long_answer_loss": 0.0769, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1834, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.088, + "grad_norm": 1.4609375, + "learning_rate": 5.404091034547311e-06, + "long_answer_loss": 0.088, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1835, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0951, + "grad_norm": 1.453125, + "learning_rate": 5.391353426720904e-06, + "long_answer_loss": 0.0951, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 1836, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0712, + "grad_norm": 1.5390625, + "learning_rate": 5.378626719375895e-06, + "long_answer_loss": 0.0712, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 1837, + "template_loss": 0.0 + }, + { + "epoch": 1.4, + "full_loss": 0.0658, + "grad_norm": 1.65625, + "learning_rate": 5.3659109320275565e-06, + "long_answer_loss": 0.0658, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 1838, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0735, + "grad_norm": 1.34375, + "learning_rate": 5.353206084174439e-06, + "long_answer_loss": 0.0735, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 1839, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.074, + "grad_norm": 1.359375, + "learning_rate": 5.340512195298291e-06, + "long_answer_loss": 0.074, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 1840, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.072, + "grad_norm": 1.3828125, + "learning_rate": 5.327829284864076e-06, + "long_answer_loss": 0.072, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 1841, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0797, + "grad_norm": 1.328125, + "learning_rate": 5.315157372319915e-06, + "long_answer_loss": 0.0797, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1842, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0609, + "grad_norm": 1.40625, + "learning_rate": 5.302496477097067e-06, + "long_answer_loss": 0.0609, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 1843, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.1022, + "grad_norm": 1.328125, + "learning_rate": 5.2898466186098934e-06, + "long_answer_loss": 0.1022, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1844, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0842, + "grad_norm": 1.4296875, + "learning_rate": 5.277207816255838e-06, + "long_answer_loss": 0.0842, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 1845, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0753, + "grad_norm": 1.3046875, + "learning_rate": 5.264580089415391e-06, + "long_answer_loss": 0.0753, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1846, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0705, + "grad_norm": 1.3515625, + "learning_rate": 5.25196345745204e-06, + "long_answer_loss": 0.0705, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1847, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0687, + "grad_norm": 1.4453125, + "learning_rate": 5.239357939712296e-06, + "long_answer_loss": 0.0687, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 1848, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0665, + "grad_norm": 1.3984375, + "learning_rate": 5.226763555525592e-06, + "long_answer_loss": 0.0665, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 1849, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0673, + "grad_norm": 1.390625, + "learning_rate": 5.214180324204307e-06, + "long_answer_loss": 0.0673, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 1850, + "template_loss": 0.0 + }, + { + "epoch": 1.41, + "full_loss": 0.0614, + "grad_norm": 1.3828125, + "learning_rate": 5.201608265043717e-06, + "long_answer_loss": 0.0614, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 1851, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0737, + "grad_norm": 1.375, + "learning_rate": 5.189047397321961e-06, + "long_answer_loss": 0.0737, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 1852, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0672, + "grad_norm": 1.5078125, + "learning_rate": 5.176497740300021e-06, + "long_answer_loss": 0.0672, + "loss": 0.0796, + "short_answer_loss": NaN, + "step": 1853, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0734, + "grad_norm": 1.421875, + "learning_rate": 5.1639593132216864e-06, + "long_answer_loss": 0.0734, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 1854, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0751, + "grad_norm": 1.296875, + "learning_rate": 5.151432135313529e-06, + "long_answer_loss": 0.0751, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1855, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0676, + "grad_norm": 1.4609375, + "learning_rate": 5.138916225784871e-06, + "long_answer_loss": 0.0676, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1856, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0746, + "grad_norm": 1.3671875, + "learning_rate": 5.126411603827748e-06, + "long_answer_loss": 0.0746, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1857, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0684, + "grad_norm": 1.3984375, + "learning_rate": 5.113918288616894e-06, + "long_answer_loss": 0.0684, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1858, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0682, + "grad_norm": 1.3671875, + "learning_rate": 5.101436299309706e-06, + "long_answer_loss": 0.0682, + "loss": 0.0659, + "short_answer_loss": NaN, + "step": 1859, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0737, + "grad_norm": 1.4140625, + "learning_rate": 5.088965655046213e-06, + "long_answer_loss": 0.0737, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 1860, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0797, + "grad_norm": 1.5234375, + "learning_rate": 5.076506374949043e-06, + "long_answer_loss": 0.0797, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1861, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0679, + "grad_norm": 1.453125, + "learning_rate": 5.0640584781234016e-06, + "long_answer_loss": 0.0679, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 1862, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0815, + "grad_norm": 1.3984375, + "learning_rate": 5.051621983657042e-06, + "long_answer_loss": 0.0815, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 1863, + "template_loss": 0.0 + }, + { + "epoch": 1.42, + "full_loss": 0.0682, + "grad_norm": 1.3125, + "learning_rate": 5.039196910620224e-06, + "long_answer_loss": 0.0682, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 1864, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0706, + "grad_norm": 1.421875, + "learning_rate": 5.026783278065708e-06, + "long_answer_loss": 0.0706, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 1865, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0842, + "grad_norm": 1.4453125, + "learning_rate": 5.01438110502869e-06, + "long_answer_loss": 0.0842, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 1866, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0686, + "grad_norm": 1.4609375, + "learning_rate": 5.00199041052682e-06, + "long_answer_loss": 0.0686, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 1867, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0551, + "grad_norm": 1.390625, + "learning_rate": 4.989611213560123e-06, + "long_answer_loss": 0.0551, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 1868, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0633, + "grad_norm": 1.4921875, + "learning_rate": 4.977243533111008e-06, + "long_answer_loss": 0.0633, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 1869, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0895, + "grad_norm": 1.3828125, + "learning_rate": 4.9648873881442185e-06, + "long_answer_loss": 0.0895, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 1870, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0688, + "grad_norm": 1.3671875, + "learning_rate": 4.9525427976068124e-06, + "long_answer_loss": 0.0688, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 1871, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0936, + "grad_norm": 1.46875, + "learning_rate": 4.940209780428133e-06, + "long_answer_loss": 0.0936, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1872, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0523, + "grad_norm": 1.484375, + "learning_rate": 4.927888355519758e-06, + "long_answer_loss": 0.0523, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 1873, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0758, + "grad_norm": 1.3828125, + "learning_rate": 4.915578541775523e-06, + "long_answer_loss": 0.0758, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 1874, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0731, + "grad_norm": 1.359375, + "learning_rate": 4.90328035807142e-06, + "long_answer_loss": 0.0731, + "loss": 0.0686, + "short_answer_loss": NaN, + "step": 1875, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.074, + "grad_norm": 1.3671875, + "learning_rate": 4.890993823265647e-06, + "long_answer_loss": 0.074, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 1876, + "template_loss": 0.0 + }, + { + "epoch": 1.43, + "full_loss": 0.0781, + "grad_norm": 1.3125, + "learning_rate": 4.878718956198504e-06, + "long_answer_loss": 0.0781, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 1877, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0788, + "grad_norm": 1.28125, + "learning_rate": 4.866455775692421e-06, + "long_answer_loss": 0.0788, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 1878, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0767, + "grad_norm": 1.359375, + "learning_rate": 4.854204300551901e-06, + "long_answer_loss": 0.0767, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 1879, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0585, + "grad_norm": 1.3984375, + "learning_rate": 4.841964549563499e-06, + "long_answer_loss": 0.0585, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 1880, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0577, + "grad_norm": 1.40625, + "learning_rate": 4.8297365414957955e-06, + "long_answer_loss": 0.0577, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1881, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0705, + "grad_norm": 1.390625, + "learning_rate": 4.817520295099348e-06, + "long_answer_loss": 0.0705, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 1882, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0583, + "grad_norm": 1.46875, + "learning_rate": 4.805315829106708e-06, + "long_answer_loss": 0.0583, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 1883, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0639, + "grad_norm": 1.421875, + "learning_rate": 4.793123162232328e-06, + "long_answer_loss": 0.0639, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 1884, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0731, + "grad_norm": 1.3828125, + "learning_rate": 4.780942313172602e-06, + "long_answer_loss": 0.0731, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1885, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0624, + "grad_norm": 1.4921875, + "learning_rate": 4.768773300605775e-06, + "long_answer_loss": 0.0624, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 1886, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0717, + "grad_norm": 1.5078125, + "learning_rate": 4.756616143191956e-06, + "long_answer_loss": 0.0717, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1887, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0768, + "grad_norm": 1.4140625, + "learning_rate": 4.744470859573075e-06, + "long_answer_loss": 0.0768, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1888, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0806, + "grad_norm": 1.34375, + "learning_rate": 4.732337468372838e-06, + "long_answer_loss": 0.0806, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1889, + "template_loss": 0.0 + }, + { + "epoch": 1.44, + "full_loss": 0.0893, + "grad_norm": 1.3671875, + "learning_rate": 4.720215988196746e-06, + "long_answer_loss": 0.0893, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1890, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0811, + "grad_norm": 1.3671875, + "learning_rate": 4.708106437632003e-06, + "long_answer_loss": 0.0811, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 1891, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0611, + "grad_norm": 1.453125, + "learning_rate": 4.6960088352475475e-06, + "long_answer_loss": 0.0611, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 1892, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0724, + "grad_norm": 1.3984375, + "learning_rate": 4.683923199593974e-06, + "long_answer_loss": 0.0724, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 1893, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0715, + "grad_norm": 1.3828125, + "learning_rate": 4.671849549203541e-06, + "long_answer_loss": 0.0715, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 1894, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0695, + "grad_norm": 1.375, + "learning_rate": 4.659787902590125e-06, + "long_answer_loss": 0.0695, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 1895, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0751, + "grad_norm": 1.4375, + "learning_rate": 4.647738278249193e-06, + "long_answer_loss": 0.0751, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 1896, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0703, + "grad_norm": 1.5390625, + "learning_rate": 4.635700694657781e-06, + "long_answer_loss": 0.0703, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 1897, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.074, + "grad_norm": 1.5234375, + "learning_rate": 4.62367517027446e-06, + "long_answer_loss": 0.074, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 1898, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0738, + "grad_norm": 1.421875, + "learning_rate": 4.6116617235393105e-06, + "long_answer_loss": 0.0738, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 1899, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0733, + "grad_norm": 1.3828125, + "learning_rate": 4.599660372873883e-06, + "long_answer_loss": 0.0733, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1900, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.061, + "grad_norm": 1.3984375, + "learning_rate": 4.587671136681203e-06, + "long_answer_loss": 0.061, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 1901, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0717, + "grad_norm": 1.4140625, + "learning_rate": 4.575694033345691e-06, + "long_answer_loss": 0.0717, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 1902, + "template_loss": 0.0 + }, + { + "epoch": 1.45, + "full_loss": 0.0633, + "grad_norm": 1.390625, + "learning_rate": 4.563729081233184e-06, + "long_answer_loss": 0.0633, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 1903, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0738, + "grad_norm": 1.4375, + "learning_rate": 4.551776298690875e-06, + "long_answer_loss": 0.0738, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 1904, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0762, + "grad_norm": 1.4296875, + "learning_rate": 4.539835704047304e-06, + "long_answer_loss": 0.0762, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 1905, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0719, + "grad_norm": 1.390625, + "learning_rate": 4.527907315612315e-06, + "long_answer_loss": 0.0719, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 1906, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0759, + "grad_norm": 1.40625, + "learning_rate": 4.515991151677038e-06, + "long_answer_loss": 0.0759, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 1907, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0911, + "grad_norm": 1.34375, + "learning_rate": 4.504087230513862e-06, + "long_answer_loss": 0.0911, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 1908, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0789, + "grad_norm": 1.34375, + "learning_rate": 4.492195570376391e-06, + "long_answer_loss": 0.0789, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 1909, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0783, + "grad_norm": 1.453125, + "learning_rate": 4.480316189499436e-06, + "long_answer_loss": 0.0783, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 1910, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.084, + "grad_norm": 1.375, + "learning_rate": 4.468449106098983e-06, + "long_answer_loss": 0.084, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 1911, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0628, + "grad_norm": 1.4296875, + "learning_rate": 4.456594338372151e-06, + "long_answer_loss": 0.0628, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 1912, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0656, + "grad_norm": 1.390625, + "learning_rate": 4.4447519044971815e-06, + "long_answer_loss": 0.0656, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1913, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0656, + "grad_norm": 1.4609375, + "learning_rate": 4.4329218226333995e-06, + "long_answer_loss": 0.0656, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 1914, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0662, + "grad_norm": 1.4609375, + "learning_rate": 4.421104110921191e-06, + "long_answer_loss": 0.0662, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 1915, + "template_loss": 0.0 + }, + { + "epoch": 1.46, + "full_loss": 0.0976, + "grad_norm": 1.4765625, + "learning_rate": 4.4092987874819704e-06, + "long_answer_loss": 0.0976, + "loss": 0.0774, + "short_answer_loss": NaN, + "step": 1916, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0638, + "grad_norm": 1.3984375, + "learning_rate": 4.397505870418162e-06, + "long_answer_loss": 0.0638, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1917, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.088, + "grad_norm": 1.328125, + "learning_rate": 4.385725377813163e-06, + "long_answer_loss": 0.088, + "loss": 0.0771, + "short_answer_loss": NaN, + "step": 1918, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0714, + "grad_norm": 1.4375, + "learning_rate": 4.3739573277313095e-06, + "long_answer_loss": 0.0714, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 1919, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0754, + "grad_norm": 1.3828125, + "learning_rate": 4.3622017382178735e-06, + "long_answer_loss": 0.0754, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 1920, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0719, + "grad_norm": 1.4375, + "learning_rate": 4.35045862729901e-06, + "long_answer_loss": 0.0719, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 1921, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0689, + "grad_norm": 1.3984375, + "learning_rate": 4.338728012981743e-06, + "long_answer_loss": 0.0689, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 1922, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.066, + "grad_norm": 1.375, + "learning_rate": 4.327009913253934e-06, + "long_answer_loss": 0.066, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 1923, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.054, + "grad_norm": 1.484375, + "learning_rate": 4.3153043460842504e-06, + "long_answer_loss": 0.054, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 1924, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0484, + "grad_norm": 1.4921875, + "learning_rate": 4.303611329422154e-06, + "long_answer_loss": 0.0484, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 1925, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0765, + "grad_norm": 1.34375, + "learning_rate": 4.2919308811978364e-06, + "long_answer_loss": 0.0765, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 1926, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0781, + "grad_norm": 1.359375, + "learning_rate": 4.28026301932225e-06, + "long_answer_loss": 0.0781, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1927, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0638, + "grad_norm": 1.4296875, + "learning_rate": 4.268607761687019e-06, + "long_answer_loss": 0.0638, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1928, + "template_loss": 0.0 + }, + { + "epoch": 1.47, + "full_loss": 0.0806, + "grad_norm": 1.390625, + "learning_rate": 4.256965126164454e-06, + "long_answer_loss": 0.0806, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 1929, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.056, + "grad_norm": 1.359375, + "learning_rate": 4.245335130607508e-06, + "long_answer_loss": 0.056, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 1930, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0765, + "grad_norm": 1.359375, + "learning_rate": 4.233717792849754e-06, + "long_answer_loss": 0.0765, + "loss": 0.0677, + "short_answer_loss": NaN, + "step": 1931, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0683, + "grad_norm": 1.3984375, + "learning_rate": 4.222113130705352e-06, + "long_answer_loss": 0.0683, + "loss": 0.0683, + "short_answer_loss": NaN, + "step": 1932, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0865, + "grad_norm": 1.4921875, + "learning_rate": 4.210521161969018e-06, + "long_answer_loss": 0.0865, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 1933, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0747, + "grad_norm": 1.3359375, + "learning_rate": 4.198941904416027e-06, + "long_answer_loss": 0.0747, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 1934, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0633, + "grad_norm": 1.3984375, + "learning_rate": 4.18737537580213e-06, + "long_answer_loss": 0.0633, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 1935, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0704, + "grad_norm": 1.3671875, + "learning_rate": 4.175821593863595e-06, + "long_answer_loss": 0.0704, + "loss": 0.0677, + "short_answer_loss": NaN, + "step": 1936, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0515, + "grad_norm": 1.3515625, + "learning_rate": 4.164280576317106e-06, + "long_answer_loss": 0.0515, + "loss": 0.0686, + "short_answer_loss": NaN, + "step": 1937, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0735, + "grad_norm": 1.3671875, + "learning_rate": 4.152752340859814e-06, + "long_answer_loss": 0.0735, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1938, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0694, + "grad_norm": 1.421875, + "learning_rate": 4.1412369051692336e-06, + "long_answer_loss": 0.0694, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 1939, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0722, + "grad_norm": 1.4140625, + "learning_rate": 4.129734286903275e-06, + "long_answer_loss": 0.0722, + "loss": 0.069, + "short_answer_loss": NaN, + "step": 1940, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.055, + "grad_norm": 1.4453125, + "learning_rate": 4.118244503700189e-06, + "long_answer_loss": 0.055, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1941, + "template_loss": 0.0 + }, + { + "epoch": 1.48, + "full_loss": 0.0641, + "grad_norm": 1.3515625, + "learning_rate": 4.106767573178531e-06, + "long_answer_loss": 0.0641, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 1942, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0684, + "grad_norm": 1.3359375, + "learning_rate": 4.095303512937176e-06, + "long_answer_loss": 0.0684, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 1943, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0839, + "grad_norm": 1.4609375, + "learning_rate": 4.083852340555233e-06, + "long_answer_loss": 0.0839, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1944, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0574, + "grad_norm": 1.34375, + "learning_rate": 4.072414073592076e-06, + "long_answer_loss": 0.0574, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 1945, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0798, + "grad_norm": 1.4296875, + "learning_rate": 4.060988729587267e-06, + "long_answer_loss": 0.0798, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 1946, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0727, + "grad_norm": 1.515625, + "learning_rate": 4.0495763260605654e-06, + "long_answer_loss": 0.0727, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 1947, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0754, + "grad_norm": 1.453125, + "learning_rate": 4.038176880511883e-06, + "long_answer_loss": 0.0754, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 1948, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0821, + "grad_norm": 1.3828125, + "learning_rate": 4.026790410421262e-06, + "long_answer_loss": 0.0821, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1949, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0685, + "grad_norm": 1.4296875, + "learning_rate": 4.015416933248853e-06, + "long_answer_loss": 0.0685, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 1950, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0678, + "grad_norm": 1.328125, + "learning_rate": 4.0040564664348665e-06, + "long_answer_loss": 0.0678, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 1951, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0633, + "grad_norm": 1.359375, + "learning_rate": 3.992709027399588e-06, + "long_answer_loss": 0.0633, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 1952, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0734, + "grad_norm": 1.3125, + "learning_rate": 3.9813746335433025e-06, + "long_answer_loss": 0.0734, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 1953, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0726, + "grad_norm": 1.4453125, + "learning_rate": 3.970053302246307e-06, + "long_answer_loss": 0.0726, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 1954, + "template_loss": 0.0 + }, + { + "epoch": 1.49, + "full_loss": 0.0718, + "grad_norm": 1.34375, + "learning_rate": 3.958745050868861e-06, + "long_answer_loss": 0.0718, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 1955, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0936, + "grad_norm": 1.4765625, + "learning_rate": 3.947449896751167e-06, + "long_answer_loss": 0.0936, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 1956, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0763, + "grad_norm": 1.390625, + "learning_rate": 3.936167857213349e-06, + "long_answer_loss": 0.0763, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 1957, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0744, + "grad_norm": 1.390625, + "learning_rate": 3.924898949555415e-06, + "long_answer_loss": 0.0744, + "loss": 0.0789, + "short_answer_loss": NaN, + "step": 1958, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0628, + "grad_norm": 1.3671875, + "learning_rate": 3.9136431910572465e-06, + "long_answer_loss": 0.0628, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 1959, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.06, + "grad_norm": 1.3515625, + "learning_rate": 3.90240059897854e-06, + "long_answer_loss": 0.06, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 1960, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0645, + "grad_norm": 1.453125, + "learning_rate": 3.891171190558833e-06, + "long_answer_loss": 0.0645, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 1961, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0723, + "grad_norm": 1.3359375, + "learning_rate": 3.879954983017421e-06, + "long_answer_loss": 0.0723, + "loss": 0.0686, + "short_answer_loss": NaN, + "step": 1962, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.08, + "grad_norm": 1.3828125, + "learning_rate": 3.868751993553368e-06, + "long_answer_loss": 0.08, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 1963, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0609, + "grad_norm": 1.3984375, + "learning_rate": 3.8575622393454735e-06, + "long_answer_loss": 0.0609, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 1964, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0603, + "grad_norm": 1.3359375, + "learning_rate": 3.846385737552231e-06, + "long_answer_loss": 0.0603, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 1965, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0765, + "grad_norm": 1.3984375, + "learning_rate": 3.835222505311822e-06, + "long_answer_loss": 0.0765, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1966, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0832, + "grad_norm": 1.4453125, + "learning_rate": 3.824072559742076e-06, + "long_answer_loss": 0.0832, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1967, + "template_loss": 0.0 + }, + { + "epoch": 1.5, + "full_loss": 0.0736, + "grad_norm": 1.53125, + "learning_rate": 3.8129359179404494e-06, + "long_answer_loss": 0.0736, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 1968, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0815, + "grad_norm": 1.5078125, + "learning_rate": 3.801812596984003e-06, + "long_answer_loss": 0.0815, + "loss": 0.0765, + "short_answer_loss": NaN, + "step": 1969, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0847, + "grad_norm": 1.40625, + "learning_rate": 3.790702613929356e-06, + "long_answer_loss": 0.0847, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 1970, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0691, + "grad_norm": 1.453125, + "learning_rate": 3.7796059858126927e-06, + "long_answer_loss": 0.0691, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 1971, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0592, + "grad_norm": 1.3828125, + "learning_rate": 3.768522729649711e-06, + "long_answer_loss": 0.0592, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 1972, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0794, + "grad_norm": 1.3828125, + "learning_rate": 3.7574528624356036e-06, + "long_answer_loss": 0.0794, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 1973, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0508, + "grad_norm": 1.328125, + "learning_rate": 3.746396401145036e-06, + "long_answer_loss": 0.0508, + "loss": 0.0671, + "short_answer_loss": NaN, + "step": 1974, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0747, + "grad_norm": 1.4375, + "learning_rate": 3.735353362732112e-06, + "long_answer_loss": 0.0747, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 1975, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0796, + "grad_norm": 1.4453125, + "learning_rate": 3.724323764130358e-06, + "long_answer_loss": 0.0796, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 1976, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.07, + "grad_norm": 1.34375, + "learning_rate": 3.713307622252686e-06, + "long_answer_loss": 0.07, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 1977, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0648, + "grad_norm": 1.4609375, + "learning_rate": 3.702304953991383e-06, + "long_answer_loss": 0.0648, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 1978, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0669, + "grad_norm": 1.40625, + "learning_rate": 3.6913157762180544e-06, + "long_answer_loss": 0.0669, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 1979, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.07, + "grad_norm": 1.4296875, + "learning_rate": 3.68034010578365e-06, + "long_answer_loss": 0.07, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 1980, + "template_loss": 0.0 + }, + { + "epoch": 1.51, + "full_loss": 0.0745, + "grad_norm": 1.3515625, + "learning_rate": 3.669377959518379e-06, + "long_answer_loss": 0.0745, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 1981, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0661, + "grad_norm": 1.3828125, + "learning_rate": 3.658429354231728e-06, + "long_answer_loss": 0.0661, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 1982, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0659, + "grad_norm": 1.484375, + "learning_rate": 3.647494306712415e-06, + "long_answer_loss": 0.0659, + "loss": 0.076, + "short_answer_loss": NaN, + "step": 1983, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0723, + "grad_norm": 1.3671875, + "learning_rate": 3.63657283372837e-06, + "long_answer_loss": 0.0723, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 1984, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0747, + "grad_norm": 1.3671875, + "learning_rate": 3.625664952026711e-06, + "long_answer_loss": 0.0747, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 1985, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0923, + "grad_norm": 1.4375, + "learning_rate": 3.614770678333698e-06, + "long_answer_loss": 0.0923, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 1986, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0648, + "grad_norm": 1.359375, + "learning_rate": 3.6038900293547536e-06, + "long_answer_loss": 0.0648, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 1987, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0694, + "grad_norm": 1.484375, + "learning_rate": 3.593023021774375e-06, + "long_answer_loss": 0.0694, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 1988, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0708, + "grad_norm": 1.421875, + "learning_rate": 3.5821696722561735e-06, + "long_answer_loss": 0.0708, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 1989, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0831, + "grad_norm": 1.3515625, + "learning_rate": 3.571329997442792e-06, + "long_answer_loss": 0.0831, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 1990, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0713, + "grad_norm": 1.359375, + "learning_rate": 3.560504013955916e-06, + "long_answer_loss": 0.0713, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 1991, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0799, + "grad_norm": 1.3671875, + "learning_rate": 3.549691738396235e-06, + "long_answer_loss": 0.0799, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 1992, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0857, + "grad_norm": 1.421875, + "learning_rate": 3.5388931873434186e-06, + "long_answer_loss": 0.0857, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 1993, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0732, + "grad_norm": 1.3671875, + "learning_rate": 3.528108377356093e-06, + "long_answer_loss": 0.0732, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 1994, + "template_loss": 0.0 + }, + { + "epoch": 1.52, + "full_loss": 0.0873, + "grad_norm": 1.375, + "learning_rate": 3.5173373249718035e-06, + "long_answer_loss": 0.0873, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 1995, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.08, + "grad_norm": 1.3359375, + "learning_rate": 3.5065800467070182e-06, + "long_answer_loss": 0.08, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 1996, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0824, + "grad_norm": 1.40625, + "learning_rate": 3.4958365590570597e-06, + "long_answer_loss": 0.0824, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 1997, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0983, + "grad_norm": 1.4140625, + "learning_rate": 3.485106878496133e-06, + "long_answer_loss": 0.0983, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 1998, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0669, + "grad_norm": 1.34375, + "learning_rate": 3.4743910214772413e-06, + "long_answer_loss": 0.0669, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 1999, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0593, + "grad_norm": 1.484375, + "learning_rate": 3.4636890044322107e-06, + "long_answer_loss": 0.0593, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2000, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0589, + "grad_norm": 1.3828125, + "learning_rate": 3.453000843771642e-06, + "long_answer_loss": 0.0589, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2001, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0815, + "grad_norm": 1.453125, + "learning_rate": 3.442326555884873e-06, + "long_answer_loss": 0.0815, + "loss": 0.081, + "short_answer_loss": NaN, + "step": 2002, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0766, + "grad_norm": 1.3671875, + "learning_rate": 3.4316661571399955e-06, + "long_answer_loss": 0.0766, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 2003, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0675, + "grad_norm": 1.34375, + "learning_rate": 3.4210196638837745e-06, + "long_answer_loss": 0.0675, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2004, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0649, + "grad_norm": 1.5625, + "learning_rate": 3.410387092441683e-06, + "long_answer_loss": 0.0649, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2005, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0683, + "grad_norm": 1.4375, + "learning_rate": 3.3997684591178177e-06, + "long_answer_loss": 0.0683, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2006, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.0721, + "grad_norm": 1.390625, + "learning_rate": 3.389163780194918e-06, + "long_answer_loss": 0.0721, + "loss": 0.0669, + "short_answer_loss": NaN, + "step": 2007, + "template_loss": 0.0 + }, + { + "epoch": 1.53, + "full_loss": 0.059, + "grad_norm": 1.3203125, + "learning_rate": 3.3785730719343226e-06, + "long_answer_loss": 0.059, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 2008, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0737, + "grad_norm": 1.421875, + "learning_rate": 3.367996350575946e-06, + "long_answer_loss": 0.0737, + "loss": 0.0765, + "short_answer_loss": NaN, + "step": 2009, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0627, + "grad_norm": 1.421875, + "learning_rate": 3.3574336323382595e-06, + "long_answer_loss": 0.0627, + "loss": 0.0648, + "short_answer_loss": NaN, + "step": 2010, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0658, + "grad_norm": 1.4296875, + "learning_rate": 3.3468849334182483e-06, + "long_answer_loss": 0.0658, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2011, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0719, + "grad_norm": 1.484375, + "learning_rate": 3.3363502699914244e-06, + "long_answer_loss": 0.0719, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 2012, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0886, + "grad_norm": 1.4609375, + "learning_rate": 3.3258296582117474e-06, + "long_answer_loss": 0.0886, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2013, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0781, + "grad_norm": 1.359375, + "learning_rate": 3.3153231142116617e-06, + "long_answer_loss": 0.0781, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2014, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0944, + "grad_norm": 1.3671875, + "learning_rate": 3.3048306541020117e-06, + "long_answer_loss": 0.0944, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2015, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.067, + "grad_norm": 1.4375, + "learning_rate": 3.2943522939720637e-06, + "long_answer_loss": 0.067, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2016, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0634, + "grad_norm": 1.421875, + "learning_rate": 3.2838880498894568e-06, + "long_answer_loss": 0.0634, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2017, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0758, + "grad_norm": 1.46875, + "learning_rate": 3.273437937900184e-06, + "long_answer_loss": 0.0758, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 2018, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0712, + "grad_norm": 1.3828125, + "learning_rate": 3.263001974028568e-06, + "long_answer_loss": 0.0712, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2019, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0595, + "grad_norm": 1.390625, + "learning_rate": 3.252580174277238e-06, + "long_answer_loss": 0.0595, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 2020, + "template_loss": 0.0 + }, + { + "epoch": 1.54, + "full_loss": 0.0628, + "grad_norm": 1.3046875, + "learning_rate": 3.242172554627107e-06, + "long_answer_loss": 0.0628, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2021, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0682, + "grad_norm": 1.4921875, + "learning_rate": 3.231779131037331e-06, + "long_answer_loss": 0.0682, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 2022, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0698, + "grad_norm": 1.4609375, + "learning_rate": 3.2213999194453128e-06, + "long_answer_loss": 0.0698, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2023, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0694, + "grad_norm": 1.4453125, + "learning_rate": 3.211034935766656e-06, + "long_answer_loss": 0.0694, + "loss": 0.076, + "short_answer_loss": NaN, + "step": 2024, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0882, + "grad_norm": 1.3671875, + "learning_rate": 3.2006841958951458e-06, + "long_answer_loss": 0.0882, + "loss": 0.0679, + "short_answer_loss": NaN, + "step": 2025, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0866, + "grad_norm": 1.4375, + "learning_rate": 3.1903477157027266e-06, + "long_answer_loss": 0.0866, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2026, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0714, + "grad_norm": 1.3125, + "learning_rate": 3.1800255110394806e-06, + "long_answer_loss": 0.0714, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2027, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0811, + "grad_norm": 1.3359375, + "learning_rate": 3.1697175977335946e-06, + "long_answer_loss": 0.0811, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2028, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0636, + "grad_norm": 1.34375, + "learning_rate": 3.1594239915913413e-06, + "long_answer_loss": 0.0636, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2029, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0601, + "grad_norm": 1.375, + "learning_rate": 3.1491447083970586e-06, + "long_answer_loss": 0.0601, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 2030, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0741, + "grad_norm": 1.3828125, + "learning_rate": 3.138879763913122e-06, + "long_answer_loss": 0.0741, + "loss": 0.0681, + "short_answer_loss": NaN, + "step": 2031, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0805, + "grad_norm": 1.359375, + "learning_rate": 3.1286291738799087e-06, + "long_answer_loss": 0.0805, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 2032, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0692, + "grad_norm": 1.3359375, + "learning_rate": 3.1183929540157973e-06, + "long_answer_loss": 0.0692, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2033, + "template_loss": 0.0 + }, + { + "epoch": 1.55, + "full_loss": 0.0677, + "grad_norm": 1.3984375, + "learning_rate": 3.1081711200171266e-06, + "long_answer_loss": 0.0677, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2034, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0674, + "grad_norm": 1.484375, + "learning_rate": 3.097963687558175e-06, + "long_answer_loss": 0.0674, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2035, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0793, + "grad_norm": 1.4453125, + "learning_rate": 3.087770672291139e-06, + "long_answer_loss": 0.0793, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 2036, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0796, + "grad_norm": 1.390625, + "learning_rate": 3.077592089846107e-06, + "long_answer_loss": 0.0796, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2037, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0851, + "grad_norm": 1.4296875, + "learning_rate": 3.0674279558310384e-06, + "long_answer_loss": 0.0851, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 2038, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0831, + "grad_norm": 1.453125, + "learning_rate": 3.0572782858317244e-06, + "long_answer_loss": 0.0831, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 2039, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.061, + "grad_norm": 1.3515625, + "learning_rate": 3.0471430954118018e-06, + "long_answer_loss": 0.061, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2040, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0708, + "grad_norm": 1.4296875, + "learning_rate": 3.037022400112678e-06, + "long_answer_loss": 0.0708, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 2041, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0673, + "grad_norm": 1.4765625, + "learning_rate": 3.02691621545355e-06, + "long_answer_loss": 0.0673, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2042, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0743, + "grad_norm": 1.3515625, + "learning_rate": 3.0168245569313566e-06, + "long_answer_loss": 0.0743, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2043, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0795, + "grad_norm": 1.4453125, + "learning_rate": 3.0067474400207672e-06, + "long_answer_loss": 0.0795, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2044, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0606, + "grad_norm": 1.375, + "learning_rate": 2.996684880174151e-06, + "long_answer_loss": 0.0606, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2045, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0719, + "grad_norm": 1.4453125, + "learning_rate": 2.9866368928215456e-06, + "long_answer_loss": 0.0719, + "loss": 0.0724, + "short_answer_loss": NaN, + "step": 2046, + "template_loss": 0.0 + }, + { + "epoch": 1.56, + "full_loss": 0.0654, + "grad_norm": 1.3671875, + "learning_rate": 2.9766034933706653e-06, + "long_answer_loss": 0.0654, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 2047, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0689, + "grad_norm": 1.359375, + "learning_rate": 2.9665846972068285e-06, + "long_answer_loss": 0.0689, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 2048, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0772, + "grad_norm": 1.3203125, + "learning_rate": 2.9565805196929864e-06, + "long_answer_loss": 0.0772, + "loss": 0.067, + "short_answer_loss": NaN, + "step": 2049, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0725, + "grad_norm": 1.4921875, + "learning_rate": 2.946590976169651e-06, + "long_answer_loss": 0.0725, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2050, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0687, + "grad_norm": 1.3515625, + "learning_rate": 2.9366160819549087e-06, + "long_answer_loss": 0.0687, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2051, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0858, + "grad_norm": 1.390625, + "learning_rate": 2.9266558523443776e-06, + "long_answer_loss": 0.0858, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2052, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0876, + "grad_norm": 1.3671875, + "learning_rate": 2.9167103026111904e-06, + "long_answer_loss": 0.0876, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2053, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0657, + "grad_norm": 1.40625, + "learning_rate": 2.9067794480059735e-06, + "long_answer_loss": 0.0657, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 2054, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0597, + "grad_norm": 1.4375, + "learning_rate": 2.896863303756801e-06, + "long_answer_loss": 0.0597, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2055, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0737, + "grad_norm": 1.4140625, + "learning_rate": 2.8869618850692227e-06, + "long_answer_loss": 0.0737, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2056, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.085, + "grad_norm": 1.4140625, + "learning_rate": 2.87707520712617e-06, + "long_answer_loss": 0.085, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 2057, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0932, + "grad_norm": 1.5390625, + "learning_rate": 2.8672032850880078e-06, + "long_answer_loss": 0.0932, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2058, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0772, + "grad_norm": 1.4296875, + "learning_rate": 2.857346134092445e-06, + "long_answer_loss": 0.0772, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2059, + "template_loss": 0.0 + }, + { + "epoch": 1.57, + "full_loss": 0.0631, + "grad_norm": 1.3671875, + "learning_rate": 2.847503769254553e-06, + "long_answer_loss": 0.0631, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2060, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0792, + "grad_norm": 1.40625, + "learning_rate": 2.837676205666731e-06, + "long_answer_loss": 0.0792, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2061, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.075, + "grad_norm": 1.421875, + "learning_rate": 2.82786345839868e-06, + "long_answer_loss": 0.075, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 2062, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0733, + "grad_norm": 1.5, + "learning_rate": 2.8180655424973806e-06, + "long_answer_loss": 0.0733, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2063, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0627, + "grad_norm": 1.53125, + "learning_rate": 2.8082824729870642e-06, + "long_answer_loss": 0.0627, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2064, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0622, + "grad_norm": 1.3046875, + "learning_rate": 2.7985142648692176e-06, + "long_answer_loss": 0.0622, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 2065, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0718, + "grad_norm": 1.453125, + "learning_rate": 2.7887609331225114e-06, + "long_answer_loss": 0.0718, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2066, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0733, + "grad_norm": 1.546875, + "learning_rate": 2.7790224927028237e-06, + "long_answer_loss": 0.0733, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2067, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0648, + "grad_norm": 1.3671875, + "learning_rate": 2.76929895854319e-06, + "long_answer_loss": 0.0648, + "loss": 0.0714, + "short_answer_loss": NaN, + "step": 2068, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0946, + "grad_norm": 1.46875, + "learning_rate": 2.7595903455537946e-06, + "long_answer_loss": 0.0946, + "loss": 0.0785, + "short_answer_loss": NaN, + "step": 2069, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0813, + "grad_norm": 1.4609375, + "learning_rate": 2.7498966686219347e-06, + "long_answer_loss": 0.0813, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 2070, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0739, + "grad_norm": 1.53125, + "learning_rate": 2.7402179426120085e-06, + "long_answer_loss": 0.0739, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2071, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0763, + "grad_norm": 1.3828125, + "learning_rate": 2.730554182365491e-06, + "long_answer_loss": 0.0763, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2072, + "template_loss": 0.0 + }, + { + "epoch": 1.58, + "full_loss": 0.0803, + "grad_norm": 1.5234375, + "learning_rate": 2.720905402700892e-06, + "long_answer_loss": 0.0803, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 2073, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.074, + "grad_norm": 1.46875, + "learning_rate": 2.7112716184137798e-06, + "long_answer_loss": 0.074, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2074, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0824, + "grad_norm": 1.4296875, + "learning_rate": 2.7016528442766977e-06, + "long_answer_loss": 0.0824, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2075, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0619, + "grad_norm": 1.421875, + "learning_rate": 2.692049095039191e-06, + "long_answer_loss": 0.0619, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 2076, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0748, + "grad_norm": 1.4296875, + "learning_rate": 2.682460385427761e-06, + "long_answer_loss": 0.0748, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 2077, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.08, + "grad_norm": 1.3828125, + "learning_rate": 2.672886730145846e-06, + "long_answer_loss": 0.08, + "loss": 0.0671, + "short_answer_loss": NaN, + "step": 2078, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0673, + "grad_norm": 1.296875, + "learning_rate": 2.6633281438738e-06, + "long_answer_loss": 0.0673, + "loss": 0.0667, + "short_answer_loss": NaN, + "step": 2079, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0622, + "grad_norm": 1.421875, + "learning_rate": 2.6537846412688707e-06, + "long_answer_loss": 0.0622, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2080, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0714, + "grad_norm": 1.3515625, + "learning_rate": 2.644256236965177e-06, + "long_answer_loss": 0.0714, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 2081, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0814, + "grad_norm": 1.4765625, + "learning_rate": 2.634742945573687e-06, + "long_answer_loss": 0.0814, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 2082, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.095, + "grad_norm": 1.5390625, + "learning_rate": 2.625244781682187e-06, + "long_answer_loss": 0.095, + "loss": 0.0764, + "short_answer_loss": NaN, + "step": 2083, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0839, + "grad_norm": 1.453125, + "learning_rate": 2.6157617598552745e-06, + "long_answer_loss": 0.0839, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2084, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.089, + "grad_norm": 1.3984375, + "learning_rate": 2.6062938946343248e-06, + "long_answer_loss": 0.089, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2085, + "template_loss": 0.0 + }, + { + "epoch": 1.59, + "full_loss": 0.0586, + "grad_norm": 1.4609375, + "learning_rate": 2.596841200537474e-06, + "long_answer_loss": 0.0586, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2086, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0648, + "grad_norm": 1.4453125, + "learning_rate": 2.5874036920595937e-06, + "long_answer_loss": 0.0648, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2087, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0535, + "grad_norm": 1.4375, + "learning_rate": 2.5779813836722677e-06, + "long_answer_loss": 0.0535, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2088, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0799, + "grad_norm": 1.3984375, + "learning_rate": 2.5685742898237748e-06, + "long_answer_loss": 0.0799, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 2089, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.071, + "grad_norm": 1.46875, + "learning_rate": 2.5591824249390607e-06, + "long_answer_loss": 0.071, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2090, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0737, + "grad_norm": 1.4140625, + "learning_rate": 2.549805803419725e-06, + "long_answer_loss": 0.0737, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2091, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0818, + "grad_norm": 1.3515625, + "learning_rate": 2.540444439643977e-06, + "long_answer_loss": 0.0818, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2092, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0694, + "grad_norm": 1.3359375, + "learning_rate": 2.5310983479666554e-06, + "long_answer_loss": 0.0694, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 2093, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0728, + "grad_norm": 1.40625, + "learning_rate": 2.5217675427191555e-06, + "long_answer_loss": 0.0728, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 2094, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.07, + "grad_norm": 1.375, + "learning_rate": 2.5124520382094466e-06, + "long_answer_loss": 0.07, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2095, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0737, + "grad_norm": 1.4609375, + "learning_rate": 2.5031518487220294e-06, + "long_answer_loss": 0.0737, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2096, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0711, + "grad_norm": 1.3828125, + "learning_rate": 2.493866988517926e-06, + "long_answer_loss": 0.0711, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2097, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.074, + "grad_norm": 1.3203125, + "learning_rate": 2.4845974718346503e-06, + "long_answer_loss": 0.074, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 2098, + "template_loss": 0.0 + }, + { + "epoch": 1.6, + "full_loss": 0.0747, + "grad_norm": 1.46875, + "learning_rate": 2.4753433128861775e-06, + "long_answer_loss": 0.0747, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2099, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0688, + "grad_norm": 1.4609375, + "learning_rate": 2.466104525862957e-06, + "long_answer_loss": 0.0688, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2100, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0772, + "grad_norm": 1.3828125, + "learning_rate": 2.456881124931837e-06, + "long_answer_loss": 0.0772, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2101, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0615, + "grad_norm": 1.390625, + "learning_rate": 2.447673124236102e-06, + "long_answer_loss": 0.0615, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2102, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0754, + "grad_norm": 1.4140625, + "learning_rate": 2.438480537895399e-06, + "long_answer_loss": 0.0754, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 2103, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0663, + "grad_norm": 1.3828125, + "learning_rate": 2.4293033800057486e-06, + "long_answer_loss": 0.0663, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2104, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0951, + "grad_norm": 1.4609375, + "learning_rate": 2.4201416646395123e-06, + "long_answer_loss": 0.0951, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2105, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0688, + "grad_norm": 1.421875, + "learning_rate": 2.410995405845369e-06, + "long_answer_loss": 0.0688, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2106, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0581, + "grad_norm": 1.4609375, + "learning_rate": 2.4018646176483056e-06, + "long_answer_loss": 0.0581, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2107, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0743, + "grad_norm": 1.4765625, + "learning_rate": 2.3927493140495653e-06, + "long_answer_loss": 0.0743, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2108, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.067, + "grad_norm": 1.390625, + "learning_rate": 2.3836495090266767e-06, + "long_answer_loss": 0.067, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2109, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0935, + "grad_norm": 1.4375, + "learning_rate": 2.3745652165333713e-06, + "long_answer_loss": 0.0935, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 2110, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0931, + "grad_norm": 1.453125, + "learning_rate": 2.365496450499623e-06, + "long_answer_loss": 0.0931, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 2111, + "template_loss": 0.0 + }, + { + "epoch": 1.61, + "full_loss": 0.0826, + "grad_norm": 1.328125, + "learning_rate": 2.356443224831574e-06, + "long_answer_loss": 0.0826, + "loss": 0.0685, + "short_answer_loss": NaN, + "step": 2112, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0823, + "grad_norm": 1.390625, + "learning_rate": 2.3474055534115495e-06, + "long_answer_loss": 0.0823, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2113, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.072, + "grad_norm": 1.484375, + "learning_rate": 2.338383450098021e-06, + "long_answer_loss": 0.072, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2114, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0791, + "grad_norm": 1.4609375, + "learning_rate": 2.3293769287255797e-06, + "long_answer_loss": 0.0791, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 2115, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0703, + "grad_norm": 1.421875, + "learning_rate": 2.3203860031049423e-06, + "long_answer_loss": 0.0703, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 2116, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0668, + "grad_norm": 1.3828125, + "learning_rate": 2.311410687022884e-06, + "long_answer_loss": 0.0668, + "loss": 0.0682, + "short_answer_loss": NaN, + "step": 2117, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0626, + "grad_norm": 1.453125, + "learning_rate": 2.302450994242275e-06, + "long_answer_loss": 0.0626, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2118, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0718, + "grad_norm": 1.4375, + "learning_rate": 2.2935069385020005e-06, + "long_answer_loss": 0.0718, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2119, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0672, + "grad_norm": 1.4296875, + "learning_rate": 2.2845785335169832e-06, + "long_answer_loss": 0.0672, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 2120, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0851, + "grad_norm": 1.4921875, + "learning_rate": 2.275665792978145e-06, + "long_answer_loss": 0.0851, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 2121, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0813, + "grad_norm": 1.4765625, + "learning_rate": 2.2667687305523836e-06, + "long_answer_loss": 0.0813, + "loss": 0.0806, + "short_answer_loss": NaN, + "step": 2122, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0648, + "grad_norm": 1.421875, + "learning_rate": 2.257887359882563e-06, + "long_answer_loss": 0.0648, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2123, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.067, + "grad_norm": 1.3828125, + "learning_rate": 2.249021694587471e-06, + "long_answer_loss": 0.067, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2124, + "template_loss": 0.0 + }, + { + "epoch": 1.62, + "full_loss": 0.0626, + "grad_norm": 1.453125, + "learning_rate": 2.2401717482618325e-06, + "long_answer_loss": 0.0626, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 2125, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0802, + "grad_norm": 1.46875, + "learning_rate": 2.2313375344762465e-06, + "long_answer_loss": 0.0802, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2126, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.073, + "grad_norm": 1.3203125, + "learning_rate": 2.2225190667772135e-06, + "long_answer_loss": 0.073, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2127, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0644, + "grad_norm": 1.34375, + "learning_rate": 2.213716358687064e-06, + "long_answer_loss": 0.0644, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2128, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0668, + "grad_norm": 1.3828125, + "learning_rate": 2.2049294237039745e-06, + "long_answer_loss": 0.0668, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2129, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0681, + "grad_norm": 1.328125, + "learning_rate": 2.1961582753019365e-06, + "long_answer_loss": 0.0681, + "loss": 0.0667, + "short_answer_loss": NaN, + "step": 2130, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0734, + "grad_norm": 1.4296875, + "learning_rate": 2.1874029269307277e-06, + "long_answer_loss": 0.0734, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2131, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0933, + "grad_norm": 1.4609375, + "learning_rate": 2.1786633920159045e-06, + "long_answer_loss": 0.0933, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 2132, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0676, + "grad_norm": 1.4375, + "learning_rate": 2.1699396839587687e-06, + "long_answer_loss": 0.0676, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 2133, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0697, + "grad_norm": 1.3671875, + "learning_rate": 2.161231816136361e-06, + "long_answer_loss": 0.0697, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2134, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0687, + "grad_norm": 1.46875, + "learning_rate": 2.1525398019014197e-06, + "long_answer_loss": 0.0687, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2135, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0676, + "grad_norm": 1.421875, + "learning_rate": 2.1438636545823843e-06, + "long_answer_loss": 0.0676, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2136, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0784, + "grad_norm": 1.34375, + "learning_rate": 2.13520338748336e-06, + "long_answer_loss": 0.0784, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2137, + "template_loss": 0.0 + }, + { + "epoch": 1.63, + "full_loss": 0.0865, + "grad_norm": 1.6171875, + "learning_rate": 2.126559013884101e-06, + "long_answer_loss": 0.0865, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2138, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0733, + "grad_norm": 1.296875, + "learning_rate": 2.1179305470399897e-06, + "long_answer_loss": 0.0733, + "loss": 0.0669, + "short_answer_loss": NaN, + "step": 2139, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0648, + "grad_norm": 1.4140625, + "learning_rate": 2.109318000182019e-06, + "long_answer_loss": 0.0648, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2140, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0807, + "grad_norm": 1.4375, + "learning_rate": 2.1007213865167684e-06, + "long_answer_loss": 0.0807, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2141, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.068, + "grad_norm": 1.4375, + "learning_rate": 2.0921407192263876e-06, + "long_answer_loss": 0.068, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 2142, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0539, + "grad_norm": 1.5234375, + "learning_rate": 2.083576011468562e-06, + "long_answer_loss": 0.0539, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2143, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0557, + "grad_norm": 1.3984375, + "learning_rate": 2.0750272763765276e-06, + "long_answer_loss": 0.0557, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2144, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0758, + "grad_norm": 1.3984375, + "learning_rate": 2.066494527059004e-06, + "long_answer_loss": 0.0758, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2145, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0755, + "grad_norm": 1.5078125, + "learning_rate": 2.057977776600213e-06, + "long_answer_loss": 0.0755, + "loss": 0.0864, + "short_answer_loss": NaN, + "step": 2146, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0617, + "grad_norm": 1.4765625, + "learning_rate": 2.049477038059838e-06, + "long_answer_loss": 0.0617, + "loss": 0.0688, + "short_answer_loss": NaN, + "step": 2147, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.077, + "grad_norm": 1.3828125, + "learning_rate": 2.040992324473011e-06, + "long_answer_loss": 0.077, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2148, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0776, + "grad_norm": 1.4375, + "learning_rate": 2.0325236488502888e-06, + "long_answer_loss": 0.0776, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2149, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0669, + "grad_norm": 1.390625, + "learning_rate": 2.0240710241776386e-06, + "long_answer_loss": 0.0669, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2150, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0843, + "grad_norm": 1.3828125, + "learning_rate": 2.0156344634164175e-06, + "long_answer_loss": 0.0843, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2151, + "template_loss": 0.0 + }, + { + "epoch": 1.64, + "full_loss": 0.0759, + "grad_norm": 1.4765625, + "learning_rate": 2.0072139795033333e-06, + "long_answer_loss": 0.0759, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2152, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0722, + "grad_norm": 1.3125, + "learning_rate": 1.9988095853504694e-06, + "long_answer_loss": 0.0722, + "loss": 0.0656, + "short_answer_loss": NaN, + "step": 2153, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0819, + "grad_norm": 1.4140625, + "learning_rate": 1.9904212938452128e-06, + "long_answer_loss": 0.0819, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2154, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0718, + "grad_norm": 1.4375, + "learning_rate": 1.982049117850268e-06, + "long_answer_loss": 0.0718, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 2155, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0736, + "grad_norm": 1.328125, + "learning_rate": 1.973693070203628e-06, + "long_answer_loss": 0.0736, + "loss": 0.0668, + "short_answer_loss": NaN, + "step": 2156, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.065, + "grad_norm": 1.3125, + "learning_rate": 1.9653531637185545e-06, + "long_answer_loss": 0.065, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2157, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0529, + "grad_norm": 1.3984375, + "learning_rate": 1.9570294111835585e-06, + "long_answer_loss": 0.0529, + "loss": 0.0656, + "short_answer_loss": NaN, + "step": 2158, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0708, + "grad_norm": 1.515625, + "learning_rate": 1.948721825362372e-06, + "long_answer_loss": 0.0708, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 2159, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0815, + "grad_norm": 1.3984375, + "learning_rate": 1.9404304189939547e-06, + "long_answer_loss": 0.0815, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2160, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0794, + "grad_norm": 1.421875, + "learning_rate": 1.9321552047924324e-06, + "long_answer_loss": 0.0794, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2161, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0801, + "grad_norm": 1.4140625, + "learning_rate": 1.9238961954471294e-06, + "long_answer_loss": 0.0801, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2162, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0746, + "grad_norm": 1.3828125, + "learning_rate": 1.915653403622497e-06, + "long_answer_loss": 0.0746, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 2163, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0718, + "grad_norm": 1.34375, + "learning_rate": 1.9074268419581294e-06, + "long_answer_loss": 0.0718, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 2164, + "template_loss": 0.0 + }, + { + "epoch": 1.65, + "full_loss": 0.0748, + "grad_norm": 1.5, + "learning_rate": 1.8992165230687336e-06, + "long_answer_loss": 0.0748, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 2165, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0721, + "grad_norm": 1.2890625, + "learning_rate": 1.891022459544109e-06, + "long_answer_loss": 0.0721, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2166, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0715, + "grad_norm": 1.4140625, + "learning_rate": 1.8828446639491279e-06, + "long_answer_loss": 0.0715, + "loss": 0.0682, + "short_answer_loss": NaN, + "step": 2167, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0796, + "grad_norm": 1.3359375, + "learning_rate": 1.874683148823711e-06, + "long_answer_loss": 0.0796, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2168, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0613, + "grad_norm": 1.3984375, + "learning_rate": 1.866537926682832e-06, + "long_answer_loss": 0.0613, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2169, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0766, + "grad_norm": 1.4609375, + "learning_rate": 1.858409010016457e-06, + "long_answer_loss": 0.0766, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2170, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0596, + "grad_norm": 1.3515625, + "learning_rate": 1.8502964112895731e-06, + "long_answer_loss": 0.0596, + "loss": 0.0669, + "short_answer_loss": NaN, + "step": 2171, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0599, + "grad_norm": 1.375, + "learning_rate": 1.8422001429421257e-06, + "long_answer_loss": 0.0599, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2172, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0727, + "grad_norm": 1.4765625, + "learning_rate": 1.8341202173890292e-06, + "long_answer_loss": 0.0727, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2173, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0634, + "grad_norm": 1.4140625, + "learning_rate": 1.8260566470201343e-06, + "long_answer_loss": 0.0634, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2174, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0845, + "grad_norm": 1.3515625, + "learning_rate": 1.8180094442002165e-06, + "long_answer_loss": 0.0845, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2175, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0824, + "grad_norm": 1.3984375, + "learning_rate": 1.8099786212689498e-06, + "long_answer_loss": 0.0824, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2176, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.068, + "grad_norm": 1.34375, + "learning_rate": 1.8019641905408862e-06, + "long_answer_loss": 0.068, + "loss": 0.0681, + "short_answer_loss": NaN, + "step": 2177, + "template_loss": 0.0 + }, + { + "epoch": 1.66, + "full_loss": 0.0848, + "grad_norm": 1.5, + "learning_rate": 1.7939661643054564e-06, + "long_answer_loss": 0.0848, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2178, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0738, + "grad_norm": 1.4140625, + "learning_rate": 1.7859845548269193e-06, + "long_answer_loss": 0.0738, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2179, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0664, + "grad_norm": 1.4296875, + "learning_rate": 1.7780193743443697e-06, + "long_answer_loss": 0.0664, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2180, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0576, + "grad_norm": 1.40625, + "learning_rate": 1.7700706350717093e-06, + "long_answer_loss": 0.0576, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2181, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0693, + "grad_norm": 1.3984375, + "learning_rate": 1.7621383491976256e-06, + "long_answer_loss": 0.0693, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2182, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0641, + "grad_norm": 1.4296875, + "learning_rate": 1.7542225288855796e-06, + "long_answer_loss": 0.0641, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2183, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0667, + "grad_norm": 1.3671875, + "learning_rate": 1.7463231862737822e-06, + "long_answer_loss": 0.0667, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2184, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.068, + "grad_norm": 1.5, + "learning_rate": 1.7384403334751802e-06, + "long_answer_loss": 0.068, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2185, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0851, + "grad_norm": 1.3828125, + "learning_rate": 1.7305739825774228e-06, + "long_answer_loss": 0.0851, + "loss": 0.0663, + "short_answer_loss": NaN, + "step": 2186, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0742, + "grad_norm": 1.453125, + "learning_rate": 1.722724145642876e-06, + "long_answer_loss": 0.0742, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2187, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0611, + "grad_norm": 1.4296875, + "learning_rate": 1.7148908347085616e-06, + "long_answer_loss": 0.0611, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2188, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0835, + "grad_norm": 1.4140625, + "learning_rate": 1.7070740617861736e-06, + "long_answer_loss": 0.0835, + "loss": 0.066, + "short_answer_loss": NaN, + "step": 2189, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0747, + "grad_norm": 1.375, + "learning_rate": 1.6992738388620408e-06, + "long_answer_loss": 0.0747, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2190, + "template_loss": 0.0 + }, + { + "epoch": 1.67, + "full_loss": 0.0906, + "grad_norm": 1.3671875, + "learning_rate": 1.691490177897119e-06, + "long_answer_loss": 0.0906, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 2191, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0859, + "grad_norm": 1.4375, + "learning_rate": 1.6837230908269623e-06, + "long_answer_loss": 0.0859, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 2192, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.1052, + "grad_norm": 1.5703125, + "learning_rate": 1.6759725895617113e-06, + "long_answer_loss": 0.1052, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 2193, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0771, + "grad_norm": 1.4140625, + "learning_rate": 1.6682386859860774e-06, + "long_answer_loss": 0.0771, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2194, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.081, + "grad_norm": 1.40625, + "learning_rate": 1.66052139195932e-06, + "long_answer_loss": 0.081, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2195, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0525, + "grad_norm": 1.3515625, + "learning_rate": 1.6528207193152235e-06, + "long_answer_loss": 0.0525, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 2196, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0649, + "grad_norm": 1.4296875, + "learning_rate": 1.6451366798620888e-06, + "long_answer_loss": 0.0649, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2197, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0549, + "grad_norm": 1.4140625, + "learning_rate": 1.637469285382713e-06, + "long_answer_loss": 0.0549, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 2198, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0653, + "grad_norm": 1.375, + "learning_rate": 1.6298185476343693e-06, + "long_answer_loss": 0.0653, + "loss": 0.077, + "short_answer_loss": NaN, + "step": 2199, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0654, + "grad_norm": 1.375, + "learning_rate": 1.6221844783487859e-06, + "long_answer_loss": 0.0654, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2200, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0929, + "grad_norm": 1.4609375, + "learning_rate": 1.6145670892321344e-06, + "long_answer_loss": 0.0929, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 2201, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0737, + "grad_norm": 1.4140625, + "learning_rate": 1.6069663919650077e-06, + "long_answer_loss": 0.0737, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2202, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0896, + "grad_norm": 1.3828125, + "learning_rate": 1.5993823982024036e-06, + "long_answer_loss": 0.0896, + "loss": 0.0789, + "short_answer_loss": NaN, + "step": 2203, + "template_loss": 0.0 + }, + { + "epoch": 1.68, + "full_loss": 0.0711, + "grad_norm": 1.453125, + "learning_rate": 1.5918151195737099e-06, + "long_answer_loss": 0.0711, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 2204, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0827, + "grad_norm": 1.34375, + "learning_rate": 1.584264567682671e-06, + "long_answer_loss": 0.0827, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2205, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0703, + "grad_norm": 1.4140625, + "learning_rate": 1.5767307541074015e-06, + "long_answer_loss": 0.0703, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2206, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0742, + "grad_norm": 1.40625, + "learning_rate": 1.5692136904003298e-06, + "long_answer_loss": 0.0742, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2207, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0759, + "grad_norm": 1.4609375, + "learning_rate": 1.5617133880882137e-06, + "long_answer_loss": 0.0759, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2208, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0649, + "grad_norm": 1.2734375, + "learning_rate": 1.554229858672103e-06, + "long_answer_loss": 0.0649, + "loss": 0.0667, + "short_answer_loss": NaN, + "step": 2209, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0697, + "grad_norm": 1.3515625, + "learning_rate": 1.5467631136273294e-06, + "long_answer_loss": 0.0697, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2210, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0657, + "grad_norm": 1.359375, + "learning_rate": 1.5393131644034885e-06, + "long_answer_loss": 0.0657, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 2211, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0683, + "grad_norm": 1.4453125, + "learning_rate": 1.5318800224244118e-06, + "long_answer_loss": 0.0683, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2212, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0544, + "grad_norm": 1.3125, + "learning_rate": 1.5244636990881758e-06, + "long_answer_loss": 0.0544, + "loss": 0.0644, + "short_answer_loss": NaN, + "step": 2213, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0736, + "grad_norm": 1.3828125, + "learning_rate": 1.5170642057670465e-06, + "long_answer_loss": 0.0736, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 2214, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.09, + "grad_norm": 1.390625, + "learning_rate": 1.5096815538075043e-06, + "long_answer_loss": 0.09, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2215, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0683, + "grad_norm": 1.3671875, + "learning_rate": 1.5023157545301854e-06, + "long_answer_loss": 0.0683, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 2216, + "template_loss": 0.0 + }, + { + "epoch": 1.69, + "full_loss": 0.0774, + "grad_norm": 1.390625, + "learning_rate": 1.4949668192298942e-06, + "long_answer_loss": 0.0774, + "loss": 0.0765, + "short_answer_loss": NaN, + "step": 2217, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0556, + "grad_norm": 1.328125, + "learning_rate": 1.487634759175574e-06, + "long_answer_loss": 0.0556, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2218, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0826, + "grad_norm": 1.3671875, + "learning_rate": 1.4803195856102917e-06, + "long_answer_loss": 0.0826, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 2219, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0693, + "grad_norm": 1.3984375, + "learning_rate": 1.4730213097512213e-06, + "long_answer_loss": 0.0693, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2220, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0746, + "grad_norm": 1.3828125, + "learning_rate": 1.4657399427896152e-06, + "long_answer_loss": 0.0746, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2221, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0711, + "grad_norm": 1.484375, + "learning_rate": 1.4584754958908195e-06, + "long_answer_loss": 0.0711, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2222, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0624, + "grad_norm": 1.375, + "learning_rate": 1.4512279801942099e-06, + "long_answer_loss": 0.0624, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 2223, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0728, + "grad_norm": 1.4375, + "learning_rate": 1.4439974068132204e-06, + "long_answer_loss": 0.0728, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2224, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0729, + "grad_norm": 1.375, + "learning_rate": 1.43678378683529e-06, + "long_answer_loss": 0.0729, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2225, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0759, + "grad_norm": 1.421875, + "learning_rate": 1.4295871313218702e-06, + "long_answer_loss": 0.0759, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2226, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0684, + "grad_norm": 1.3984375, + "learning_rate": 1.4224074513083983e-06, + "long_answer_loss": 0.0684, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2227, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0797, + "grad_norm": 1.375, + "learning_rate": 1.415244757804271e-06, + "long_answer_loss": 0.0797, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2228, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0756, + "grad_norm": 1.359375, + "learning_rate": 1.4080990617928571e-06, + "long_answer_loss": 0.0756, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2229, + "template_loss": 0.0 + }, + { + "epoch": 1.7, + "full_loss": 0.0881, + "grad_norm": 1.4609375, + "learning_rate": 1.4009703742314404e-06, + "long_answer_loss": 0.0881, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 2230, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0692, + "grad_norm": 1.421875, + "learning_rate": 1.3938587060512417e-06, + "long_answer_loss": 0.0692, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 2231, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0805, + "grad_norm": 1.390625, + "learning_rate": 1.3867640681573687e-06, + "long_answer_loss": 0.0805, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2232, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0745, + "grad_norm": 1.3515625, + "learning_rate": 1.379686471428826e-06, + "long_answer_loss": 0.0745, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 2233, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0676, + "grad_norm": 1.359375, + "learning_rate": 1.3726259267184807e-06, + "long_answer_loss": 0.0676, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2234, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0785, + "grad_norm": 1.4296875, + "learning_rate": 1.3655824448530557e-06, + "long_answer_loss": 0.0785, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 2235, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0802, + "grad_norm": 1.546875, + "learning_rate": 1.35855603663311e-06, + "long_answer_loss": 0.0802, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2236, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.068, + "grad_norm": 1.4609375, + "learning_rate": 1.3515467128330115e-06, + "long_answer_loss": 0.068, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 2237, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.061, + "grad_norm": 1.3125, + "learning_rate": 1.3445544842009493e-06, + "long_answer_loss": 0.061, + "loss": 0.0661, + "short_answer_loss": NaN, + "step": 2238, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0749, + "grad_norm": 1.4765625, + "learning_rate": 1.3375793614588794e-06, + "long_answer_loss": 0.0749, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2239, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0687, + "grad_norm": 1.375, + "learning_rate": 1.3306213553025444e-06, + "long_answer_loss": 0.0687, + "loss": 0.0681, + "short_answer_loss": NaN, + "step": 2240, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0632, + "grad_norm": 1.375, + "learning_rate": 1.323680476401426e-06, + "long_answer_loss": 0.0632, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2241, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0636, + "grad_norm": 1.4453125, + "learning_rate": 1.3167567353987498e-06, + "long_answer_loss": 0.0636, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2242, + "template_loss": 0.0 + }, + { + "epoch": 1.71, + "full_loss": 0.0698, + "grad_norm": 1.3046875, + "learning_rate": 1.3098501429114618e-06, + "long_answer_loss": 0.0698, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 2243, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0659, + "grad_norm": 1.34375, + "learning_rate": 1.3029607095302112e-06, + "long_answer_loss": 0.0659, + "loss": 0.0665, + "short_answer_loss": NaN, + "step": 2244, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0689, + "grad_norm": 1.3828125, + "learning_rate": 1.296088445819335e-06, + "long_answer_loss": 0.0689, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 2245, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0667, + "grad_norm": 1.3359375, + "learning_rate": 1.2892333623168426e-06, + "long_answer_loss": 0.0667, + "loss": 0.0668, + "short_answer_loss": NaN, + "step": 2246, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0643, + "grad_norm": 1.4375, + "learning_rate": 1.2823954695344005e-06, + "long_answer_loss": 0.0643, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2247, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0776, + "grad_norm": 1.453125, + "learning_rate": 1.2755747779573099e-06, + "long_answer_loss": 0.0776, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2248, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0697, + "grad_norm": 1.421875, + "learning_rate": 1.2687712980444994e-06, + "long_answer_loss": 0.0697, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2249, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0537, + "grad_norm": 1.296875, + "learning_rate": 1.2619850402285054e-06, + "long_answer_loss": 0.0537, + "loss": 0.0644, + "short_answer_loss": NaN, + "step": 2250, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0674, + "grad_norm": 1.3984375, + "learning_rate": 1.255216014915453e-06, + "long_answer_loss": 0.0674, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2251, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0817, + "grad_norm": 1.3828125, + "learning_rate": 1.2484642324850471e-06, + "long_answer_loss": 0.0817, + "loss": 0.0752, + "short_answer_loss": NaN, + "step": 2252, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0771, + "grad_norm": 1.4765625, + "learning_rate": 1.2417297032905465e-06, + "long_answer_loss": 0.0771, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2253, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0703, + "grad_norm": 1.3359375, + "learning_rate": 1.23501243765876e-06, + "long_answer_loss": 0.0703, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2254, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.0677, + "grad_norm": 1.5, + "learning_rate": 1.2283124458900202e-06, + "long_answer_loss": 0.0677, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2255, + "template_loss": 0.0 + }, + { + "epoch": 1.72, + "full_loss": 0.066, + "grad_norm": 1.4921875, + "learning_rate": 1.2216297382581663e-06, + "long_answer_loss": 0.066, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2256, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0787, + "grad_norm": 1.453125, + "learning_rate": 1.2149643250105495e-06, + "long_answer_loss": 0.0787, + "loss": 0.076, + "short_answer_loss": NaN, + "step": 2257, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0715, + "grad_norm": 1.390625, + "learning_rate": 1.2083162163679857e-06, + "long_answer_loss": 0.0715, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2258, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0668, + "grad_norm": 1.375, + "learning_rate": 1.2016854225247633e-06, + "long_answer_loss": 0.0668, + "loss": 0.068, + "short_answer_loss": NaN, + "step": 2259, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0817, + "grad_norm": 1.3359375, + "learning_rate": 1.1950719536486201e-06, + "long_answer_loss": 0.0817, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2260, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0673, + "grad_norm": 1.4140625, + "learning_rate": 1.1884758198807258e-06, + "long_answer_loss": 0.0673, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2261, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.081, + "grad_norm": 1.4375, + "learning_rate": 1.1818970313356673e-06, + "long_answer_loss": 0.081, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2262, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0797, + "grad_norm": 1.3984375, + "learning_rate": 1.1753355981014374e-06, + "long_answer_loss": 0.0797, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2263, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.075, + "grad_norm": 1.390625, + "learning_rate": 1.1687915302394144e-06, + "long_answer_loss": 0.075, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 2264, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0692, + "grad_norm": 1.390625, + "learning_rate": 1.1622648377843437e-06, + "long_answer_loss": 0.0692, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2265, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0967, + "grad_norm": 1.4140625, + "learning_rate": 1.1557555307443387e-06, + "long_answer_loss": 0.0967, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 2266, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0723, + "grad_norm": 1.4453125, + "learning_rate": 1.1492636191008418e-06, + "long_answer_loss": 0.0723, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 2267, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.066, + "grad_norm": 1.328125, + "learning_rate": 1.142789112808626e-06, + "long_answer_loss": 0.066, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 2268, + "template_loss": 0.0 + }, + { + "epoch": 1.73, + "full_loss": 0.0841, + "grad_norm": 1.34375, + "learning_rate": 1.1363320217957746e-06, + "long_answer_loss": 0.0841, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 2269, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0717, + "grad_norm": 1.3671875, + "learning_rate": 1.1298923559636686e-06, + "long_answer_loss": 0.0717, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2270, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0853, + "grad_norm": 1.4921875, + "learning_rate": 1.1234701251869665e-06, + "long_answer_loss": 0.0853, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 2271, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0811, + "grad_norm": 1.421875, + "learning_rate": 1.1170653393135847e-06, + "long_answer_loss": 0.0811, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2272, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0791, + "grad_norm": 1.390625, + "learning_rate": 1.1106780081647075e-06, + "long_answer_loss": 0.0791, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 2273, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0606, + "grad_norm": 1.3359375, + "learning_rate": 1.1043081415347323e-06, + "long_answer_loss": 0.0606, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 2274, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0778, + "grad_norm": 1.453125, + "learning_rate": 1.0979557491912956e-06, + "long_answer_loss": 0.0778, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2275, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0708, + "grad_norm": 1.390625, + "learning_rate": 1.0916208408752237e-06, + "long_answer_loss": 0.0708, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2276, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0571, + "grad_norm": 1.3515625, + "learning_rate": 1.085303426300542e-06, + "long_answer_loss": 0.0571, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 2277, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0834, + "grad_norm": 1.375, + "learning_rate": 1.0790035151544447e-06, + "long_answer_loss": 0.0834, + "loss": 0.0685, + "short_answer_loss": NaN, + "step": 2278, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0931, + "grad_norm": 1.453125, + "learning_rate": 1.0727211170972916e-06, + "long_answer_loss": 0.0931, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 2279, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0787, + "grad_norm": 1.5, + "learning_rate": 1.0664562417625853e-06, + "long_answer_loss": 0.0787, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 2280, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0684, + "grad_norm": 1.390625, + "learning_rate": 1.0602088987569537e-06, + "long_answer_loss": 0.0684, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2281, + "template_loss": 0.0 + }, + { + "epoch": 1.74, + "full_loss": 0.0831, + "grad_norm": 1.3515625, + "learning_rate": 1.053979097660153e-06, + "long_answer_loss": 0.0831, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2282, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0878, + "grad_norm": 1.40625, + "learning_rate": 1.0477668480250239e-06, + "long_answer_loss": 0.0878, + "loss": 0.078, + "short_answer_loss": NaN, + "step": 2283, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0787, + "grad_norm": 1.4296875, + "learning_rate": 1.0415721593775101e-06, + "long_answer_loss": 0.0787, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 2284, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0845, + "grad_norm": 1.4609375, + "learning_rate": 1.0353950412166149e-06, + "long_answer_loss": 0.0845, + "loss": 0.0782, + "short_answer_loss": NaN, + "step": 2285, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0677, + "grad_norm": 1.34375, + "learning_rate": 1.0292355030144044e-06, + "long_answer_loss": 0.0677, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2286, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0702, + "grad_norm": 1.453125, + "learning_rate": 1.0230935542159855e-06, + "long_answer_loss": 0.0702, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2287, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.068, + "grad_norm": 1.4296875, + "learning_rate": 1.0169692042394957e-06, + "long_answer_loss": 0.068, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2288, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0702, + "grad_norm": 1.5078125, + "learning_rate": 1.0108624624760852e-06, + "long_answer_loss": 0.0702, + "loss": 0.0791, + "short_answer_loss": NaN, + "step": 2289, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0754, + "grad_norm": 1.421875, + "learning_rate": 1.0047733382898966e-06, + "long_answer_loss": 0.0754, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2290, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.071, + "grad_norm": 1.3828125, + "learning_rate": 9.987018410180724e-07, + "long_answer_loss": 0.071, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2291, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0671, + "grad_norm": 1.328125, + "learning_rate": 9.926479799707109e-07, + "long_answer_loss": 0.0671, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 2292, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0811, + "grad_norm": 1.4296875, + "learning_rate": 9.866117644308754e-07, + "long_answer_loss": 0.0811, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 2293, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0713, + "grad_norm": 1.3671875, + "learning_rate": 9.805932036545686e-07, + "long_answer_loss": 0.0713, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2294, + "template_loss": 0.0 + }, + { + "epoch": 1.75, + "full_loss": 0.0806, + "grad_norm": 1.3671875, + "learning_rate": 9.745923068707225e-07, + "long_answer_loss": 0.0806, + "loss": 0.0746, + "short_answer_loss": NaN, + "step": 2295, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0974, + "grad_norm": 1.3984375, + "learning_rate": 9.68609083281183e-07, + "long_answer_loss": 0.0974, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2296, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0903, + "grad_norm": 1.4140625, + "learning_rate": 9.626435420606913e-07, + "long_answer_loss": 0.0903, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2297, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0622, + "grad_norm": 1.40625, + "learning_rate": 9.56695692356882e-07, + "long_answer_loss": 0.0622, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2298, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0697, + "grad_norm": 1.390625, + "learning_rate": 9.50765543290251e-07, + "long_answer_loss": 0.0697, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 2299, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0724, + "grad_norm": 1.3515625, + "learning_rate": 9.448531039541672e-07, + "long_answer_loss": 0.0724, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2300, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0762, + "grad_norm": 1.328125, + "learning_rate": 9.389583834148244e-07, + "long_answer_loss": 0.0762, + "loss": 0.0678, + "short_answer_loss": NaN, + "step": 2301, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0749, + "grad_norm": 1.375, + "learning_rate": 9.330813907112615e-07, + "long_answer_loss": 0.0749, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2302, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0785, + "grad_norm": 1.453125, + "learning_rate": 9.272221348553253e-07, + "long_answer_loss": 0.0785, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2303, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.083, + "grad_norm": 1.40625, + "learning_rate": 9.213806248316664e-07, + "long_answer_loss": 0.083, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2304, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0761, + "grad_norm": 1.4375, + "learning_rate": 9.155568695977265e-07, + "long_answer_loss": 0.0761, + "loss": 0.067, + "short_answer_loss": NaN, + "step": 2305, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0793, + "grad_norm": 1.40625, + "learning_rate": 9.097508780837177e-07, + "long_answer_loss": 0.0793, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 2306, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.077, + "grad_norm": 1.40625, + "learning_rate": 9.039626591926156e-07, + "long_answer_loss": 0.077, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 2307, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0782, + "grad_norm": 1.3203125, + "learning_rate": 8.981922218001454e-07, + "long_answer_loss": 0.0782, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2308, + "template_loss": 0.0 + }, + { + "epoch": 1.76, + "full_loss": 0.0694, + "grad_norm": 1.4609375, + "learning_rate": 8.924395747547568e-07, + "long_answer_loss": 0.0694, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2309, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0663, + "grad_norm": 1.3203125, + "learning_rate": 8.867047268776296e-07, + "long_answer_loss": 0.0663, + "loss": 0.0632, + "short_answer_loss": NaN, + "step": 2310, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0783, + "grad_norm": 1.359375, + "learning_rate": 8.809876869626463e-07, + "long_answer_loss": 0.0783, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2311, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0762, + "grad_norm": 1.4140625, + "learning_rate": 8.752884637763817e-07, + "long_answer_loss": 0.0762, + "loss": 0.0665, + "short_answer_loss": NaN, + "step": 2312, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0575, + "grad_norm": 1.390625, + "learning_rate": 8.69607066058091e-07, + "long_answer_loss": 0.0575, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2313, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0759, + "grad_norm": 1.4765625, + "learning_rate": 8.639435025196957e-07, + "long_answer_loss": 0.0759, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2314, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0818, + "grad_norm": 1.453125, + "learning_rate": 8.582977818457696e-07, + "long_answer_loss": 0.0818, + "loss": 0.0755, + "short_answer_loss": NaN, + "step": 2315, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0825, + "grad_norm": 1.4140625, + "learning_rate": 8.526699126935267e-07, + "long_answer_loss": 0.0825, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2316, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0792, + "grad_norm": 1.4453125, + "learning_rate": 8.470599036928096e-07, + "long_answer_loss": 0.0792, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 2317, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.074, + "grad_norm": 1.390625, + "learning_rate": 8.41467763446066e-07, + "long_answer_loss": 0.074, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 2318, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0812, + "grad_norm": 1.4296875, + "learning_rate": 8.35893500528355e-07, + "long_answer_loss": 0.0812, + "loss": 0.0783, + "short_answer_loss": NaN, + "step": 2319, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0702, + "grad_norm": 1.5390625, + "learning_rate": 8.303371234873111e-07, + "long_answer_loss": 0.0702, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2320, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.0743, + "grad_norm": 1.390625, + "learning_rate": 8.24798640843151e-07, + "long_answer_loss": 0.0743, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2321, + "template_loss": 0.0 + }, + { + "epoch": 1.77, + "full_loss": 0.061, + "grad_norm": 1.34375, + "learning_rate": 8.192780610886449e-07, + "long_answer_loss": 0.061, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 2322, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0864, + "grad_norm": 1.5078125, + "learning_rate": 8.137753926891187e-07, + "long_answer_loss": 0.0864, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 2323, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0767, + "grad_norm": 1.390625, + "learning_rate": 8.082906440824253e-07, + "long_answer_loss": 0.0767, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2324, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0553, + "grad_norm": 1.296875, + "learning_rate": 8.028238236789401e-07, + "long_answer_loss": 0.0553, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2325, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0551, + "grad_norm": 1.3515625, + "learning_rate": 7.973749398615546e-07, + "long_answer_loss": 0.0551, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 2326, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0656, + "grad_norm": 1.3359375, + "learning_rate": 7.919440009856436e-07, + "long_answer_loss": 0.0656, + "loss": 0.0691, + "short_answer_loss": NaN, + "step": 2327, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.069, + "grad_norm": 1.390625, + "learning_rate": 7.8653101537908e-07, + "long_answer_loss": 0.069, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2328, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0673, + "grad_norm": 1.4140625, + "learning_rate": 7.811359913421939e-07, + "long_answer_loss": 0.0673, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2329, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0649, + "grad_norm": 1.5, + "learning_rate": 7.757589371477775e-07, + "long_answer_loss": 0.0649, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2330, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0715, + "grad_norm": 1.421875, + "learning_rate": 7.703998610410712e-07, + "long_answer_loss": 0.0715, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2331, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0708, + "grad_norm": 1.4140625, + "learning_rate": 7.650587712397419e-07, + "long_answer_loss": 0.0708, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 2332, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0691, + "grad_norm": 1.390625, + "learning_rate": 7.597356759338828e-07, + "long_answer_loss": 0.0691, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 2333, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0673, + "grad_norm": 1.4609375, + "learning_rate": 7.544305832859825e-07, + "long_answer_loss": 0.0673, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2334, + "template_loss": 0.0 + }, + { + "epoch": 1.78, + "full_loss": 0.0868, + "grad_norm": 1.375, + "learning_rate": 7.49143501430942e-07, + "long_answer_loss": 0.0868, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2335, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0652, + "grad_norm": 1.390625, + "learning_rate": 7.438744384760249e-07, + "long_answer_loss": 0.0652, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 2336, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0782, + "grad_norm": 1.4375, + "learning_rate": 7.38623402500882e-07, + "long_answer_loss": 0.0782, + "loss": 0.075, + "short_answer_loss": NaN, + "step": 2337, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0669, + "grad_norm": 1.3828125, + "learning_rate": 7.333904015575058e-07, + "long_answer_loss": 0.0669, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2338, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0818, + "grad_norm": 1.3671875, + "learning_rate": 7.28175443670244e-07, + "long_answer_loss": 0.0818, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2339, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0596, + "grad_norm": 1.375, + "learning_rate": 7.229785368357764e-07, + "long_answer_loss": 0.0596, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2340, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0786, + "grad_norm": 1.4453125, + "learning_rate": 7.177996890230937e-07, + "long_answer_loss": 0.0786, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2341, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0687, + "grad_norm": 1.375, + "learning_rate": 7.126389081735075e-07, + "long_answer_loss": 0.0687, + "loss": 0.0688, + "short_answer_loss": NaN, + "step": 2342, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0766, + "grad_norm": 1.453125, + "learning_rate": 7.074962022006151e-07, + "long_answer_loss": 0.0766, + "loss": 0.0788, + "short_answer_loss": NaN, + "step": 2343, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0589, + "grad_norm": 1.3828125, + "learning_rate": 7.02371578990306e-07, + "long_answer_loss": 0.0589, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 2344, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.083, + "grad_norm": 1.421875, + "learning_rate": 6.972650464007344e-07, + "long_answer_loss": 0.083, + "loss": 0.0724, + "short_answer_loss": NaN, + "step": 2345, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.0804, + "grad_norm": 1.3984375, + "learning_rate": 6.921766122623158e-07, + "long_answer_loss": 0.0804, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 2346, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.066, + "grad_norm": 1.4296875, + "learning_rate": 6.871062843777157e-07, + "long_answer_loss": 0.066, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 2347, + "template_loss": 0.0 + }, + { + "epoch": 1.79, + "full_loss": 0.085, + "grad_norm": 1.390625, + "learning_rate": 6.820540705218343e-07, + "long_answer_loss": 0.085, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 2348, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0753, + "grad_norm": 1.3671875, + "learning_rate": 6.770199784417966e-07, + "long_answer_loss": 0.0753, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 2349, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0756, + "grad_norm": 1.40625, + "learning_rate": 6.720040158569322e-07, + "long_answer_loss": 0.0756, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2350, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0676, + "grad_norm": 1.3828125, + "learning_rate": 6.670061904587826e-07, + "long_answer_loss": 0.0676, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2351, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0473, + "grad_norm": 1.390625, + "learning_rate": 6.620265099110679e-07, + "long_answer_loss": 0.0473, + "loss": 0.0638, + "short_answer_loss": NaN, + "step": 2352, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0752, + "grad_norm": 1.53125, + "learning_rate": 6.570649818496922e-07, + "long_answer_loss": 0.0752, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2353, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.067, + "grad_norm": 1.390625, + "learning_rate": 6.521216138827155e-07, + "long_answer_loss": 0.067, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2354, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0564, + "grad_norm": 1.3515625, + "learning_rate": 6.471964135903578e-07, + "long_answer_loss": 0.0564, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 2355, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0809, + "grad_norm": 1.421875, + "learning_rate": 6.42289388524979e-07, + "long_answer_loss": 0.0809, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2356, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0749, + "grad_norm": 1.4453125, + "learning_rate": 6.374005462110685e-07, + "long_answer_loss": 0.0749, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2357, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.061, + "grad_norm": 1.3203125, + "learning_rate": 6.32529894145234e-07, + "long_answer_loss": 0.061, + "loss": 0.0678, + "short_answer_loss": NaN, + "step": 2358, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0737, + "grad_norm": 1.453125, + "learning_rate": 6.276774397961885e-07, + "long_answer_loss": 0.0737, + "loss": 0.0724, + "short_answer_loss": NaN, + "step": 2359, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0871, + "grad_norm": 1.46875, + "learning_rate": 6.228431906047467e-07, + "long_answer_loss": 0.0871, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2360, + "template_loss": 0.0 + }, + { + "epoch": 1.8, + "full_loss": 0.0753, + "grad_norm": 1.8515625, + "learning_rate": 6.180271539837954e-07, + "long_answer_loss": 0.0753, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2361, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0731, + "grad_norm": 1.3046875, + "learning_rate": 6.132293373183065e-07, + "long_answer_loss": 0.0731, + "loss": 0.0651, + "short_answer_loss": NaN, + "step": 2362, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0698, + "grad_norm": 1.359375, + "learning_rate": 6.084497479653062e-07, + "long_answer_loss": 0.0698, + "loss": 0.0682, + "short_answer_loss": NaN, + "step": 2363, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0755, + "grad_norm": 1.453125, + "learning_rate": 6.03688393253872e-07, + "long_answer_loss": 0.0755, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2364, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0718, + "grad_norm": 1.3984375, + "learning_rate": 5.989452804851206e-07, + "long_answer_loss": 0.0718, + "loss": 0.0743, + "short_answer_loss": NaN, + "step": 2365, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0754, + "grad_norm": 1.375, + "learning_rate": 5.94220416932198e-07, + "long_answer_loss": 0.0754, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2366, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0916, + "grad_norm": 1.4140625, + "learning_rate": 5.895138098402628e-07, + "long_answer_loss": 0.0916, + "loss": 0.0777, + "short_answer_loss": NaN, + "step": 2367, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0745, + "grad_norm": 1.3671875, + "learning_rate": 5.848254664264848e-07, + "long_answer_loss": 0.0745, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2368, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0879, + "grad_norm": 1.3515625, + "learning_rate": 5.801553938800192e-07, + "long_answer_loss": 0.0879, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2369, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0695, + "grad_norm": 1.40625, + "learning_rate": 5.755035993620137e-07, + "long_answer_loss": 0.0695, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2370, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0781, + "grad_norm": 1.359375, + "learning_rate": 5.708700900055819e-07, + "long_answer_loss": 0.0781, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2371, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0877, + "grad_norm": 1.5078125, + "learning_rate": 5.662548729158015e-07, + "long_answer_loss": 0.0877, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 2372, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0715, + "grad_norm": 1.359375, + "learning_rate": 5.616579551697004e-07, + "long_answer_loss": 0.0715, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2373, + "template_loss": 0.0 + }, + { + "epoch": 1.81, + "full_loss": 0.0649, + "grad_norm": 1.421875, + "learning_rate": 5.570793438162456e-07, + "long_answer_loss": 0.0649, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2374, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0812, + "grad_norm": 1.46875, + "learning_rate": 5.525190458763332e-07, + "long_answer_loss": 0.0812, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2375, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0691, + "grad_norm": 1.4609375, + "learning_rate": 5.479770683427768e-07, + "long_answer_loss": 0.0691, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 2376, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0883, + "grad_norm": 1.40625, + "learning_rate": 5.434534181803008e-07, + "long_answer_loss": 0.0883, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2377, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0847, + "grad_norm": 1.7421875, + "learning_rate": 5.389481023255149e-07, + "long_answer_loss": 0.0847, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2378, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0707, + "grad_norm": 1.328125, + "learning_rate": 5.344611276869318e-07, + "long_answer_loss": 0.0707, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2379, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0777, + "grad_norm": 1.3984375, + "learning_rate": 5.299925011449269e-07, + "long_answer_loss": 0.0777, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 2380, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0743, + "grad_norm": 1.34375, + "learning_rate": 5.255422295517426e-07, + "long_answer_loss": 0.0743, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2381, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0812, + "grad_norm": 1.3515625, + "learning_rate": 5.211103197314784e-07, + "long_answer_loss": 0.0812, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2382, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0833, + "grad_norm": 1.40625, + "learning_rate": 5.166967784800774e-07, + "long_answer_loss": 0.0833, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2383, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0688, + "grad_norm": 1.3671875, + "learning_rate": 5.123016125653163e-07, + "long_answer_loss": 0.0688, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2384, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0814, + "grad_norm": 1.421875, + "learning_rate": 5.079248287267885e-07, + "long_answer_loss": 0.0814, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2385, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0697, + "grad_norm": 1.40625, + "learning_rate": 5.035664336759116e-07, + "long_answer_loss": 0.0697, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2386, + "template_loss": 0.0 + }, + { + "epoch": 1.82, + "full_loss": 0.0608, + "grad_norm": 1.4296875, + "learning_rate": 4.992264340958924e-07, + "long_answer_loss": 0.0608, + "loss": 0.0678, + "short_answer_loss": NaN, + "step": 2387, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0744, + "grad_norm": 1.421875, + "learning_rate": 4.94904836641745e-07, + "long_answer_loss": 0.0744, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2388, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0608, + "grad_norm": 1.359375, + "learning_rate": 4.906016479402504e-07, + "long_answer_loss": 0.0608, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 2389, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0828, + "grad_norm": 1.3828125, + "learning_rate": 4.863168745899704e-07, + "long_answer_loss": 0.0828, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2390, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0727, + "grad_norm": 1.3984375, + "learning_rate": 4.820505231612274e-07, + "long_answer_loss": 0.0727, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2391, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0733, + "grad_norm": 1.3984375, + "learning_rate": 4.778026001960936e-07, + "long_answer_loss": 0.0733, + "loss": 0.0773, + "short_answer_loss": NaN, + "step": 2392, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0689, + "grad_norm": 1.4609375, + "learning_rate": 4.73573112208385e-07, + "long_answer_loss": 0.0689, + "loss": 0.0759, + "short_answer_loss": NaN, + "step": 2393, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0918, + "grad_norm": 1.578125, + "learning_rate": 4.693620656836442e-07, + "long_answer_loss": 0.0918, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2394, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0708, + "grad_norm": 1.3671875, + "learning_rate": 4.6516946707914205e-07, + "long_answer_loss": 0.0708, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2395, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.087, + "grad_norm": 1.4765625, + "learning_rate": 4.609953228238553e-07, + "long_answer_loss": 0.087, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 2396, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0608, + "grad_norm": 1.3828125, + "learning_rate": 4.568396393184696e-07, + "long_answer_loss": 0.0608, + "loss": 0.0669, + "short_answer_loss": NaN, + "step": 2397, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.07, + "grad_norm": 1.3671875, + "learning_rate": 4.527024229353541e-07, + "long_answer_loss": 0.07, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 2398, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0613, + "grad_norm": 1.328125, + "learning_rate": 4.485836800185661e-07, + "long_answer_loss": 0.0613, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2399, + "template_loss": 0.0 + }, + { + "epoch": 1.83, + "full_loss": 0.0643, + "grad_norm": 1.3359375, + "learning_rate": 4.444834168838355e-07, + "long_answer_loss": 0.0643, + "loss": 0.0688, + "short_answer_loss": NaN, + "step": 2400, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0703, + "grad_norm": 1.3828125, + "learning_rate": 4.4040163981855095e-07, + "long_answer_loss": 0.0703, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 2401, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0727, + "grad_norm": 1.5078125, + "learning_rate": 4.3633835508175987e-07, + "long_answer_loss": 0.0727, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2402, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0771, + "grad_norm": 1.4140625, + "learning_rate": 4.322935689041449e-07, + "long_answer_loss": 0.0771, + "loss": 0.0774, + "short_answer_loss": NaN, + "step": 2403, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0807, + "grad_norm": 1.4296875, + "learning_rate": 4.2826728748803504e-07, + "long_answer_loss": 0.0807, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2404, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0996, + "grad_norm": 1.296875, + "learning_rate": 4.242595170073735e-07, + "long_answer_loss": 0.0996, + "loss": 0.0681, + "short_answer_loss": NaN, + "step": 2405, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0733, + "grad_norm": 1.453125, + "learning_rate": 4.2027026360772215e-07, + "long_answer_loss": 0.0733, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 2406, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0721, + "grad_norm": 1.3203125, + "learning_rate": 4.162995334062489e-07, + "long_answer_loss": 0.0721, + "loss": 0.0672, + "short_answer_loss": NaN, + "step": 2407, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0816, + "grad_norm": 1.40625, + "learning_rate": 4.1234733249171794e-07, + "long_answer_loss": 0.0816, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2408, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0664, + "grad_norm": 1.390625, + "learning_rate": 4.084136669244801e-07, + "long_answer_loss": 0.0664, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2409, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0903, + "grad_norm": 1.4375, + "learning_rate": 4.044985427364645e-07, + "long_answer_loss": 0.0903, + "loss": 0.0765, + "short_answer_loss": NaN, + "step": 2410, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0713, + "grad_norm": 1.375, + "learning_rate": 4.0060196593116747e-07, + "long_answer_loss": 0.0713, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2411, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.0668, + "grad_norm": 1.4375, + "learning_rate": 3.9672394248364414e-07, + "long_answer_loss": 0.0668, + "loss": 0.0694, + "short_answer_loss": NaN, + "step": 2412, + "template_loss": 0.0 + }, + { + "epoch": 1.84, + "full_loss": 0.065, + "grad_norm": 1.34375, + "learning_rate": 3.9286447834050304e-07, + "long_answer_loss": 0.065, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 2413, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0767, + "grad_norm": 1.421875, + "learning_rate": 3.890235794198907e-07, + "long_answer_loss": 0.0767, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2414, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.069, + "grad_norm": 1.4765625, + "learning_rate": 3.8520125161148475e-07, + "long_answer_loss": 0.069, + "loss": 0.0803, + "short_answer_loss": NaN, + "step": 2415, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0714, + "grad_norm": 1.4375, + "learning_rate": 3.8139750077648834e-07, + "long_answer_loss": 0.0714, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2416, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.083, + "grad_norm": 1.4609375, + "learning_rate": 3.7761233274761774e-07, + "long_answer_loss": 0.083, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2417, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0671, + "grad_norm": 1.390625, + "learning_rate": 3.738457533290926e-07, + "long_answer_loss": 0.0671, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 2418, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.067, + "grad_norm": 1.46875, + "learning_rate": 3.7009776829663027e-07, + "long_answer_loss": 0.067, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 2419, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0779, + "grad_norm": 1.3671875, + "learning_rate": 3.663683833974349e-07, + "long_answer_loss": 0.0779, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 2420, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0607, + "grad_norm": 1.34375, + "learning_rate": 3.626576043501889e-07, + "long_answer_loss": 0.0607, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 2421, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0726, + "grad_norm": 1.4453125, + "learning_rate": 3.5896543684504205e-07, + "long_answer_loss": 0.0726, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 2422, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0784, + "grad_norm": 1.3671875, + "learning_rate": 3.5529188654361e-07, + "long_answer_loss": 0.0784, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2423, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0727, + "grad_norm": 1.34375, + "learning_rate": 3.5163695907895477e-07, + "long_answer_loss": 0.0727, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2424, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0809, + "grad_norm": 1.359375, + "learning_rate": 3.480006600555849e-07, + "long_answer_loss": 0.0809, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2425, + "template_loss": 0.0 + }, + { + "epoch": 1.85, + "full_loss": 0.0754, + "grad_norm": 1.453125, + "learning_rate": 3.4438299504944563e-07, + "long_answer_loss": 0.0754, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2426, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0632, + "grad_norm": 1.5234375, + "learning_rate": 3.4078396960790656e-07, + "long_answer_loss": 0.0632, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 2427, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0809, + "grad_norm": 1.3984375, + "learning_rate": 3.37203589249753e-07, + "long_answer_loss": 0.0809, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 2428, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0755, + "grad_norm": 1.375, + "learning_rate": 3.3364185946518217e-07, + "long_answer_loss": 0.0755, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 2429, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.068, + "grad_norm": 1.359375, + "learning_rate": 3.3009878571579473e-07, + "long_answer_loss": 0.068, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2430, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0681, + "grad_norm": 1.3203125, + "learning_rate": 3.26574373434578e-07, + "long_answer_loss": 0.0681, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2431, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0864, + "grad_norm": 1.484375, + "learning_rate": 3.2306862802590904e-07, + "long_answer_loss": 0.0864, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2432, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0857, + "grad_norm": 1.3984375, + "learning_rate": 3.195815548655376e-07, + "long_answer_loss": 0.0857, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 2433, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0886, + "grad_norm": 1.328125, + "learning_rate": 3.1611315930058225e-07, + "long_answer_loss": 0.0886, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 2434, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0747, + "grad_norm": 1.390625, + "learning_rate": 3.126634466495207e-07, + "long_answer_loss": 0.0747, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2435, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0843, + "grad_norm": 1.3828125, + "learning_rate": 3.092324222021825e-07, + "long_answer_loss": 0.0843, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2436, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0685, + "grad_norm": 1.46875, + "learning_rate": 3.05820091219744e-07, + "long_answer_loss": 0.0685, + "loss": 0.0766, + "short_answer_loss": NaN, + "step": 2437, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0817, + "grad_norm": 1.484375, + "learning_rate": 3.0242645893470563e-07, + "long_answer_loss": 0.0817, + "loss": 0.0749, + "short_answer_loss": NaN, + "step": 2438, + "template_loss": 0.0 + }, + { + "epoch": 1.86, + "full_loss": 0.0703, + "grad_norm": 1.3671875, + "learning_rate": 2.990515305509117e-07, + "long_answer_loss": 0.0703, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 2439, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0671, + "grad_norm": 1.3984375, + "learning_rate": 2.9569531124350876e-07, + "long_answer_loss": 0.0671, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 2440, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0762, + "grad_norm": 1.4375, + "learning_rate": 2.923578061589688e-07, + "long_answer_loss": 0.0762, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 2441, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0845, + "grad_norm": 1.46875, + "learning_rate": 2.890390204150564e-07, + "long_answer_loss": 0.0845, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 2442, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0726, + "grad_norm": 1.3671875, + "learning_rate": 2.857389591008383e-07, + "long_answer_loss": 0.0726, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2443, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0583, + "grad_norm": 1.515625, + "learning_rate": 2.824576272766666e-07, + "long_answer_loss": 0.0583, + "loss": 0.0793, + "short_answer_loss": NaN, + "step": 2444, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0842, + "grad_norm": 1.40625, + "learning_rate": 2.791950299741747e-07, + "long_answer_loss": 0.0842, + "loss": 0.0704, + "short_answer_loss": NaN, + "step": 2445, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0642, + "grad_norm": 1.4140625, + "learning_rate": 2.7595117219626626e-07, + "long_answer_loss": 0.0642, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2446, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0612, + "grad_norm": 1.375, + "learning_rate": 2.727260589171096e-07, + "long_answer_loss": 0.0612, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 2447, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0754, + "grad_norm": 1.4453125, + "learning_rate": 2.6951969508213355e-07, + "long_answer_loss": 0.0754, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2448, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0708, + "grad_norm": 1.421875, + "learning_rate": 2.6633208560800927e-07, + "long_answer_loss": 0.0708, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2449, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0651, + "grad_norm": 1.421875, + "learning_rate": 2.631632353826602e-07, + "long_answer_loss": 0.0651, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2450, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0677, + "grad_norm": 1.453125, + "learning_rate": 2.600131492652341e-07, + "long_answer_loss": 0.0677, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 2451, + "template_loss": 0.0 + }, + { + "epoch": 1.87, + "full_loss": 0.0875, + "grad_norm": 1.328125, + "learning_rate": 2.568818320861102e-07, + "long_answer_loss": 0.0875, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2452, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0776, + "grad_norm": 1.3984375, + "learning_rate": 2.5376928864688927e-07, + "long_answer_loss": 0.0776, + "loss": 0.076, + "short_answer_loss": NaN, + "step": 2453, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.082, + "grad_norm": 1.484375, + "learning_rate": 2.50675523720377e-07, + "long_answer_loss": 0.082, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2454, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0796, + "grad_norm": 1.359375, + "learning_rate": 2.476005420505925e-07, + "long_answer_loss": 0.0796, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2455, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0673, + "grad_norm": 1.4140625, + "learning_rate": 2.4454434835274596e-07, + "long_answer_loss": 0.0673, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2456, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0753, + "grad_norm": 1.5, + "learning_rate": 2.4150694731324283e-07, + "long_answer_loss": 0.0753, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2457, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0778, + "grad_norm": 1.390625, + "learning_rate": 2.3848834358966705e-07, + "long_answer_loss": 0.0778, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 2458, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0652, + "grad_norm": 1.4453125, + "learning_rate": 2.3548854181078272e-07, + "long_answer_loss": 0.0652, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2459, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0699, + "grad_norm": 1.4140625, + "learning_rate": 2.3250754657651863e-07, + "long_answer_loss": 0.0699, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2460, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.071, + "grad_norm": 1.4765625, + "learning_rate": 2.2954536245796827e-07, + "long_answer_loss": 0.071, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 2461, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0692, + "grad_norm": 1.421875, + "learning_rate": 2.2660199399738014e-07, + "long_answer_loss": 0.0692, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2462, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0698, + "grad_norm": 1.390625, + "learning_rate": 2.2367744570814808e-07, + "long_answer_loss": 0.0698, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 2463, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0823, + "grad_norm": 1.34375, + "learning_rate": 2.2077172207481123e-07, + "long_answer_loss": 0.0823, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2464, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0684, + "grad_norm": 1.4296875, + "learning_rate": 2.1788482755303734e-07, + "long_answer_loss": 0.0684, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 2465, + "template_loss": 0.0 + }, + { + "epoch": 1.88, + "full_loss": 0.0684, + "grad_norm": 1.4140625, + "learning_rate": 2.1501676656962428e-07, + "long_answer_loss": 0.0684, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2466, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0703, + "grad_norm": 1.390625, + "learning_rate": 2.1216754352249151e-07, + "long_answer_loss": 0.0703, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2467, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0848, + "grad_norm": 1.359375, + "learning_rate": 2.093371627806706e-07, + "long_answer_loss": 0.0848, + "loss": 0.0742, + "short_answer_loss": NaN, + "step": 2468, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0688, + "grad_norm": 1.3984375, + "learning_rate": 2.0652562868429953e-07, + "long_answer_loss": 0.0688, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2469, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0767, + "grad_norm": 1.4140625, + "learning_rate": 2.0373294554461715e-07, + "long_answer_loss": 0.0767, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2470, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.074, + "grad_norm": 1.515625, + "learning_rate": 2.0095911764395764e-07, + "long_answer_loss": 0.074, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2471, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0531, + "grad_norm": 1.375, + "learning_rate": 1.9820414923574087e-07, + "long_answer_loss": 0.0531, + "loss": 0.0712, + "short_answer_loss": NaN, + "step": 2472, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0651, + "grad_norm": 1.4296875, + "learning_rate": 1.9546804454446676e-07, + "long_answer_loss": 0.0651, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2473, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0795, + "grad_norm": 1.3984375, + "learning_rate": 1.9275080776570976e-07, + "long_answer_loss": 0.0795, + "loss": 0.0705, + "short_answer_loss": NaN, + "step": 2474, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0687, + "grad_norm": 1.3984375, + "learning_rate": 1.9005244306611185e-07, + "long_answer_loss": 0.0687, + "loss": 0.0745, + "short_answer_loss": NaN, + "step": 2475, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0729, + "grad_norm": 1.4140625, + "learning_rate": 1.8737295458337855e-07, + "long_answer_loss": 0.0729, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2476, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.064, + "grad_norm": 1.3828125, + "learning_rate": 1.84712346426269e-07, + "long_answer_loss": 0.064, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2477, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0605, + "grad_norm": 1.328125, + "learning_rate": 1.8207062267458775e-07, + "long_answer_loss": 0.0605, + "loss": 0.0688, + "short_answer_loss": NaN, + "step": 2478, + "template_loss": 0.0 + }, + { + "epoch": 1.89, + "full_loss": 0.0823, + "grad_norm": 1.359375, + "learning_rate": 1.7944778737918748e-07, + "long_answer_loss": 0.0823, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2479, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0835, + "grad_norm": 1.484375, + "learning_rate": 1.7684384456195385e-07, + "long_answer_loss": 0.0835, + "loss": 0.0779, + "short_answer_loss": NaN, + "step": 2480, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.068, + "grad_norm": 1.4296875, + "learning_rate": 1.7425879821580394e-07, + "long_answer_loss": 0.068, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2481, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0701, + "grad_norm": 1.40625, + "learning_rate": 1.7169265230467525e-07, + "long_answer_loss": 0.0701, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 2482, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0749, + "grad_norm": 1.4375, + "learning_rate": 1.6914541076352847e-07, + "long_answer_loss": 0.0749, + "loss": 0.0776, + "short_answer_loss": NaN, + "step": 2483, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0687, + "grad_norm": 1.3828125, + "learning_rate": 1.6661707749833082e-07, + "long_answer_loss": 0.0687, + "loss": 0.0673, + "short_answer_loss": NaN, + "step": 2484, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0806, + "grad_norm": 1.3828125, + "learning_rate": 1.6410765638606023e-07, + "long_answer_loss": 0.0806, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2485, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0754, + "grad_norm": 1.375, + "learning_rate": 1.616171512746914e-07, + "long_answer_loss": 0.0754, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2486, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0975, + "grad_norm": 1.4375, + "learning_rate": 1.5914556598319307e-07, + "long_answer_loss": 0.0975, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 2487, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0758, + "grad_norm": 1.4375, + "learning_rate": 1.5669290430152388e-07, + "long_answer_loss": 0.0758, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2488, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0888, + "grad_norm": 1.453125, + "learning_rate": 1.5425916999062402e-07, + "long_answer_loss": 0.0888, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 2489, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.073, + "grad_norm": 1.4296875, + "learning_rate": 1.5184436678241097e-07, + "long_answer_loss": 0.073, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2490, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0751, + "grad_norm": 1.4765625, + "learning_rate": 1.4944849837976726e-07, + "long_answer_loss": 0.0751, + "loss": 0.0778, + "short_answer_loss": NaN, + "step": 2491, + "template_loss": 0.0 + }, + { + "epoch": 1.9, + "full_loss": 0.0552, + "grad_norm": 1.328125, + "learning_rate": 1.4707156845655267e-07, + "long_answer_loss": 0.0552, + "loss": 0.0731, + "short_answer_loss": NaN, + "step": 2492, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0773, + "grad_norm": 1.375, + "learning_rate": 1.447135806575725e-07, + "long_answer_loss": 0.0773, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 2493, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0629, + "grad_norm": 1.375, + "learning_rate": 1.4237453859859696e-07, + "long_answer_loss": 0.0629, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2494, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0669, + "grad_norm": 1.5859375, + "learning_rate": 1.4005444586633886e-07, + "long_answer_loss": 0.0669, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2495, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0683, + "grad_norm": 1.3515625, + "learning_rate": 1.377533060184552e-07, + "long_answer_loss": 0.0683, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 2496, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0654, + "grad_norm": 1.5078125, + "learning_rate": 1.3547112258354143e-07, + "long_answer_loss": 0.0654, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 2497, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0653, + "grad_norm": 1.3515625, + "learning_rate": 1.3320789906112186e-07, + "long_answer_loss": 0.0653, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 2498, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0696, + "grad_norm": 1.3203125, + "learning_rate": 1.309636389216537e-07, + "long_answer_loss": 0.0696, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2499, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0706, + "grad_norm": 1.4921875, + "learning_rate": 1.2873834560650778e-07, + "long_answer_loss": 0.0706, + "loss": 0.0761, + "short_answer_loss": NaN, + "step": 2500, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.068, + "grad_norm": 1.40625, + "learning_rate": 1.2653202252797815e-07, + "long_answer_loss": 0.068, + "loss": 0.0756, + "short_answer_loss": NaN, + "step": 2501, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.087, + "grad_norm": 1.5390625, + "learning_rate": 1.2434467306926405e-07, + "long_answer_loss": 0.087, + "loss": 0.0823, + "short_answer_loss": NaN, + "step": 2502, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0638, + "grad_norm": 1.453125, + "learning_rate": 1.2217630058447282e-07, + "long_answer_loss": 0.0638, + "loss": 0.0696, + "short_answer_loss": NaN, + "step": 2503, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0692, + "grad_norm": 1.3515625, + "learning_rate": 1.2002690839861276e-07, + "long_answer_loss": 0.0692, + "loss": 0.0695, + "short_answer_loss": NaN, + "step": 2504, + "template_loss": 0.0 + }, + { + "epoch": 1.91, + "full_loss": 0.0725, + "grad_norm": 1.359375, + "learning_rate": 1.1789649980758627e-07, + "long_answer_loss": 0.0725, + "loss": 0.0769, + "short_answer_loss": NaN, + "step": 2505, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0665, + "grad_norm": 1.359375, + "learning_rate": 1.1578507807818717e-07, + "long_answer_loss": 0.0665, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2506, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0819, + "grad_norm": 1.390625, + "learning_rate": 1.1369264644809363e-07, + "long_answer_loss": 0.0819, + "loss": 0.0767, + "short_answer_loss": NaN, + "step": 2507, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.071, + "grad_norm": 1.40625, + "learning_rate": 1.1161920812586546e-07, + "long_answer_loss": 0.071, + "loss": 0.0729, + "short_answer_loss": NaN, + "step": 2508, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0788, + "grad_norm": 1.4375, + "learning_rate": 1.0956476629093438e-07, + "long_answer_loss": 0.0788, + "loss": 0.0744, + "short_answer_loss": NaN, + "step": 2509, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0744, + "grad_norm": 1.390625, + "learning_rate": 1.0752932409360955e-07, + "long_answer_loss": 0.0744, + "loss": 0.0698, + "short_answer_loss": NaN, + "step": 2510, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0751, + "grad_norm": 1.390625, + "learning_rate": 1.055128846550596e-07, + "long_answer_loss": 0.0751, + "loss": 0.0727, + "short_answer_loss": NaN, + "step": 2511, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0758, + "grad_norm": 1.390625, + "learning_rate": 1.0351545106731669e-07, + "long_answer_loss": 0.0758, + "loss": 0.0757, + "short_answer_loss": NaN, + "step": 2512, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0892, + "grad_norm": 1.40625, + "learning_rate": 1.015370263932669e-07, + "long_answer_loss": 0.0892, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 2513, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0743, + "grad_norm": 1.484375, + "learning_rate": 9.957761366665292e-08, + "long_answer_loss": 0.0743, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2514, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0696, + "grad_norm": 1.40625, + "learning_rate": 9.763721589205882e-08, + "long_answer_loss": 0.0696, + "loss": 0.066, + "short_answer_loss": NaN, + "step": 2515, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.069, + "grad_norm": 1.4296875, + "learning_rate": 9.571583604491286e-08, + "long_answer_loss": 0.069, + "loss": 0.0772, + "short_answer_loss": NaN, + "step": 2516, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0715, + "grad_norm": 1.3203125, + "learning_rate": 9.381347707148325e-08, + "long_answer_loss": 0.0715, + "loss": 0.0709, + "short_answer_loss": NaN, + "step": 2517, + "template_loss": 0.0 + }, + { + "epoch": 1.92, + "full_loss": 0.0694, + "grad_norm": 1.421875, + "learning_rate": 9.193014188886712e-08, + "long_answer_loss": 0.0694, + "loss": 0.0692, + "short_answer_loss": NaN, + "step": 2518, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0599, + "grad_norm": 1.4296875, + "learning_rate": 9.006583338499463e-08, + "long_answer_loss": 0.0599, + "loss": 0.0707, + "short_answer_loss": NaN, + "step": 2519, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0666, + "grad_norm": 1.375, + "learning_rate": 8.822055441861515e-08, + "long_answer_loss": 0.0666, + "loss": 0.0699, + "short_answer_loss": NaN, + "step": 2520, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0722, + "grad_norm": 1.421875, + "learning_rate": 8.639430781930413e-08, + "long_answer_loss": 0.0722, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2521, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0703, + "grad_norm": 1.3515625, + "learning_rate": 8.458709638744788e-08, + "long_answer_loss": 0.0703, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2522, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0777, + "grad_norm": 1.421875, + "learning_rate": 8.279892289424635e-08, + "long_answer_loss": 0.0777, + "loss": 0.0792, + "short_answer_loss": NaN, + "step": 2523, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.072, + "grad_norm": 1.375, + "learning_rate": 8.102979008170474e-08, + "long_answer_loss": 0.072, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2524, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0821, + "grad_norm": 1.6015625, + "learning_rate": 7.927970066263085e-08, + "long_answer_loss": 0.0821, + "loss": 0.0775, + "short_answer_loss": NaN, + "step": 2525, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0658, + "grad_norm": 1.3671875, + "learning_rate": 7.754865732063493e-08, + "long_answer_loss": 0.0658, + "loss": 0.0671, + "short_answer_loss": NaN, + "step": 2526, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0581, + "grad_norm": 1.390625, + "learning_rate": 7.58366627101173e-08, + "long_answer_loss": 0.0581, + "loss": 0.0721, + "short_answer_loss": NaN, + "step": 2527, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0652, + "grad_norm": 1.421875, + "learning_rate": 7.41437194562697e-08, + "long_answer_loss": 0.0652, + "loss": 0.0677, + "short_answer_loss": NaN, + "step": 2528, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0712, + "grad_norm": 1.375, + "learning_rate": 7.246983015507247e-08, + "long_answer_loss": 0.0712, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2529, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0765, + "grad_norm": 1.3984375, + "learning_rate": 7.081499737328634e-08, + "long_answer_loss": 0.0765, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2530, + "template_loss": 0.0 + }, + { + "epoch": 1.93, + "full_loss": 0.0742, + "grad_norm": 1.421875, + "learning_rate": 6.917922364845092e-08, + "long_answer_loss": 0.0742, + "loss": 0.0702, + "short_answer_loss": NaN, + "step": 2531, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0863, + "grad_norm": 1.390625, + "learning_rate": 6.75625114888806e-08, + "long_answer_loss": 0.0863, + "loss": 0.0719, + "short_answer_loss": NaN, + "step": 2532, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0711, + "grad_norm": 1.4140625, + "learning_rate": 6.596486337366176e-08, + "long_answer_loss": 0.0711, + "loss": 0.0778, + "short_answer_loss": NaN, + "step": 2533, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0877, + "grad_norm": 1.5078125, + "learning_rate": 6.438628175264582e-08, + "long_answer_loss": 0.0877, + "loss": 0.0787, + "short_answer_loss": NaN, + "step": 2534, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0691, + "grad_norm": 1.3359375, + "learning_rate": 6.282676904644652e-08, + "long_answer_loss": 0.0691, + "loss": 0.0768, + "short_answer_loss": NaN, + "step": 2535, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.071, + "grad_norm": 1.421875, + "learning_rate": 6.12863276464426e-08, + "long_answer_loss": 0.071, + "loss": 0.0741, + "short_answer_loss": NaN, + "step": 2536, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0685, + "grad_norm": 1.453125, + "learning_rate": 5.976495991476121e-08, + "long_answer_loss": 0.0685, + "loss": 0.0718, + "short_answer_loss": NaN, + "step": 2537, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.07, + "grad_norm": 1.4296875, + "learning_rate": 5.826266818428766e-08, + "long_answer_loss": 0.07, + "loss": 0.0784, + "short_answer_loss": NaN, + "step": 2538, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0773, + "grad_norm": 1.359375, + "learning_rate": 5.6779454758652816e-08, + "long_answer_loss": 0.0773, + "loss": 0.0736, + "short_answer_loss": NaN, + "step": 2539, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0702, + "grad_norm": 1.375, + "learning_rate": 5.531532191223321e-08, + "long_answer_loss": 0.0702, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2540, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0625, + "grad_norm": 1.3203125, + "learning_rate": 5.3870271890146814e-08, + "long_answer_loss": 0.0625, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2541, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0771, + "grad_norm": 1.3515625, + "learning_rate": 5.244430690825031e-08, + "long_answer_loss": 0.0771, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 2542, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0726, + "grad_norm": 1.3515625, + "learning_rate": 5.103742915313764e-08, + "long_answer_loss": 0.0726, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2543, + "template_loss": 0.0 + }, + { + "epoch": 1.94, + "full_loss": 0.0756, + "grad_norm": 1.4453125, + "learning_rate": 4.964964078212619e-08, + "long_answer_loss": 0.0756, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2544, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0728, + "grad_norm": 1.3671875, + "learning_rate": 4.828094392327204e-08, + "long_answer_loss": 0.0728, + "loss": 0.071, + "short_answer_loss": NaN, + "step": 2545, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0792, + "grad_norm": 1.4140625, + "learning_rate": 4.6931340675347714e-08, + "long_answer_loss": 0.0792, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2546, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.05, + "grad_norm": 1.3671875, + "learning_rate": 4.560083310785196e-08, + "long_answer_loss": 0.05, + "loss": 0.0674, + "short_answer_loss": NaN, + "step": 2547, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.077, + "grad_norm": 1.3359375, + "learning_rate": 4.4289423260999994e-08, + "long_answer_loss": 0.077, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2548, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0671, + "grad_norm": 1.46875, + "learning_rate": 4.299711314572352e-08, + "long_answer_loss": 0.0671, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2549, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0806, + "grad_norm": 1.4375, + "learning_rate": 4.172390474366517e-08, + "long_answer_loss": 0.0806, + "loss": 0.0733, + "short_answer_loss": NaN, + "step": 2550, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0705, + "grad_norm": 1.34375, + "learning_rate": 4.0469800007177096e-08, + "long_answer_loss": 0.0705, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 2551, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0857, + "grad_norm": 1.3984375, + "learning_rate": 3.923480085931963e-08, + "long_answer_loss": 0.0857, + "loss": 0.0774, + "short_answer_loss": NaN, + "step": 2552, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0755, + "grad_norm": 1.359375, + "learning_rate": 3.8018909193854315e-08, + "long_answer_loss": 0.0755, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2553, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0735, + "grad_norm": 1.28125, + "learning_rate": 3.6822126875242504e-08, + "long_answer_loss": 0.0735, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 2554, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0694, + "grad_norm": 1.375, + "learning_rate": 3.564445573864539e-08, + "long_answer_loss": 0.0694, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2555, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0679, + "grad_norm": 1.46875, + "learning_rate": 3.448589758991705e-08, + "long_answer_loss": 0.0679, + "loss": 0.074, + "short_answer_loss": NaN, + "step": 2556, + "template_loss": 0.0 + }, + { + "epoch": 1.95, + "full_loss": 0.0666, + "grad_norm": 1.3984375, + "learning_rate": 3.334645420560445e-08, + "long_answer_loss": 0.0666, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2557, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0819, + "grad_norm": 1.3203125, + "learning_rate": 3.222612733294189e-08, + "long_answer_loss": 0.0819, + "loss": 0.0725, + "short_answer_loss": NaN, + "step": 2558, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0688, + "grad_norm": 1.3671875, + "learning_rate": 3.112491868985379e-08, + "long_answer_loss": 0.0688, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 2559, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0737, + "grad_norm": 1.3671875, + "learning_rate": 3.004282996494495e-08, + "long_answer_loss": 0.0737, + "loss": 0.0754, + "short_answer_loss": NaN, + "step": 2560, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0735, + "grad_norm": 1.53125, + "learning_rate": 2.8979862817503368e-08, + "long_answer_loss": 0.0735, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2561, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0896, + "grad_norm": 1.453125, + "learning_rate": 2.793601887749464e-08, + "long_answer_loss": 0.0896, + "loss": 0.0751, + "short_answer_loss": NaN, + "step": 2562, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0798, + "grad_norm": 1.4765625, + "learning_rate": 2.6911299745562e-08, + "long_answer_loss": 0.0798, + "loss": 0.0762, + "short_answer_loss": NaN, + "step": 2563, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0881, + "grad_norm": 1.359375, + "learning_rate": 2.590570699302214e-08, + "long_answer_loss": 0.0881, + "loss": 0.0666, + "short_answer_loss": NaN, + "step": 2564, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0644, + "grad_norm": 1.421875, + "learning_rate": 2.4919242161859646e-08, + "long_answer_loss": 0.0644, + "loss": 0.073, + "short_answer_loss": NaN, + "step": 2565, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0725, + "grad_norm": 1.375, + "learning_rate": 2.3951906764735353e-08, + "long_answer_loss": 0.0725, + "loss": 0.0693, + "short_answer_loss": NaN, + "step": 2566, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0635, + "grad_norm": 1.3828125, + "learning_rate": 2.3003702284969676e-08, + "long_answer_loss": 0.0635, + "loss": 0.0701, + "short_answer_loss": NaN, + "step": 2567, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0625, + "grad_norm": 1.3984375, + "learning_rate": 2.2074630176550927e-08, + "long_answer_loss": 0.0625, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2568, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.0781, + "grad_norm": 1.4765625, + "learning_rate": 2.1164691864129783e-08, + "long_answer_loss": 0.0781, + "loss": 0.0713, + "short_answer_loss": NaN, + "step": 2569, + "template_loss": 0.0 + }, + { + "epoch": 1.96, + "full_loss": 0.07, + "grad_norm": 1.34375, + "learning_rate": 2.02738887430165e-08, + "long_answer_loss": 0.07, + "loss": 0.0728, + "short_answer_loss": NaN, + "step": 2570, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0656, + "grad_norm": 1.3359375, + "learning_rate": 1.9402222179178142e-08, + "long_answer_loss": 0.0656, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2571, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0539, + "grad_norm": 1.5078125, + "learning_rate": 1.8549693509238576e-08, + "long_answer_loss": 0.0539, + "loss": 0.0737, + "short_answer_loss": NaN, + "step": 2572, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0661, + "grad_norm": 1.4140625, + "learning_rate": 1.7716304040475697e-08, + "long_answer_loss": 0.0661, + "loss": 0.0738, + "short_answer_loss": NaN, + "step": 2573, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.074, + "grad_norm": 1.3828125, + "learning_rate": 1.6902055050817268e-08, + "long_answer_loss": 0.074, + "loss": 0.0703, + "short_answer_loss": NaN, + "step": 2574, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0855, + "grad_norm": 3.296875, + "learning_rate": 1.6106947788845082e-08, + "long_answer_loss": 0.0855, + "loss": 0.0715, + "short_answer_loss": NaN, + "step": 2575, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.059, + "grad_norm": 1.3828125, + "learning_rate": 1.533098347378109e-08, + "long_answer_loss": 0.059, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2576, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0748, + "grad_norm": 1.3984375, + "learning_rate": 1.4574163295502652e-08, + "long_answer_loss": 0.0748, + "loss": 0.0681, + "short_answer_loss": NaN, + "step": 2577, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0721, + "grad_norm": 1.40625, + "learning_rate": 1.3836488414524507e-08, + "long_answer_loss": 0.0721, + "loss": 0.069, + "short_answer_loss": NaN, + "step": 2578, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0746, + "grad_norm": 1.390625, + "learning_rate": 1.3117959962005711e-08, + "long_answer_loss": 0.0746, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2579, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0715, + "grad_norm": 1.421875, + "learning_rate": 1.2418579039746859e-08, + "long_answer_loss": 0.0715, + "loss": 0.0726, + "short_answer_loss": NaN, + "step": 2580, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0867, + "grad_norm": 1.3671875, + "learning_rate": 1.1738346720185922e-08, + "long_answer_loss": 0.0867, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2581, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0794, + "grad_norm": 1.484375, + "learning_rate": 1.1077264046399638e-08, + "long_answer_loss": 0.0794, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2582, + "template_loss": 0.0 + }, + { + "epoch": 1.97, + "full_loss": 0.0711, + "grad_norm": 1.4453125, + "learning_rate": 1.0435332032100731e-08, + "long_answer_loss": 0.0711, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2583, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0633, + "grad_norm": 1.3828125, + "learning_rate": 9.812551661633751e-09, + "long_answer_loss": 0.0633, + "loss": 0.0708, + "short_answer_loss": NaN, + "step": 2584, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0695, + "grad_norm": 1.375, + "learning_rate": 9.208923889979237e-09, + "long_answer_loss": 0.0695, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 2585, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0898, + "grad_norm": 1.453125, + "learning_rate": 8.624449642745391e-09, + "long_answer_loss": 0.0898, + "loss": 0.0722, + "short_answer_loss": NaN, + "step": 2586, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0657, + "grad_norm": 1.3984375, + "learning_rate": 8.059129816170851e-09, + "long_answer_loss": 0.0657, + "loss": 0.0659, + "short_answer_loss": NaN, + "step": 2587, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0633, + "grad_norm": 1.3671875, + "learning_rate": 7.512965277126083e-09, + "long_answer_loss": 0.0633, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2588, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0744, + "grad_norm": 1.3828125, + "learning_rate": 6.985956863105047e-09, + "long_answer_loss": 0.0744, + "loss": 0.0735, + "short_answer_loss": NaN, + "step": 2589, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0649, + "grad_norm": 1.359375, + "learning_rate": 6.478105382229371e-09, + "long_answer_loss": 0.0649, + "loss": 0.0711, + "short_answer_loss": NaN, + "step": 2590, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0631, + "grad_norm": 1.421875, + "learning_rate": 5.989411613242791e-09, + "long_answer_loss": 0.0631, + "loss": 0.067, + "short_answer_loss": NaN, + "step": 2591, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0722, + "grad_norm": 1.3984375, + "learning_rate": 5.519876305515315e-09, + "long_answer_loss": 0.0722, + "loss": 0.0706, + "short_answer_loss": NaN, + "step": 2592, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0779, + "grad_norm": 1.4140625, + "learning_rate": 5.069500179036291e-09, + "long_answer_loss": 0.0779, + "loss": 0.0717, + "short_answer_loss": NaN, + "step": 2593, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.0775, + "grad_norm": 1.3828125, + "learning_rate": 4.63828392441995e-09, + "long_answer_loss": 0.0775, + "loss": 0.0732, + "short_answer_loss": NaN, + "step": 2594, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.099, + "grad_norm": 1.328125, + "learning_rate": 4.226228202897087e-09, + "long_answer_loss": 0.099, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 2595, + "template_loss": 0.0 + }, + { + "epoch": 1.98, + "full_loss": 0.055, + "grad_norm": 1.4296875, + "learning_rate": 3.833333646319215e-09, + "long_answer_loss": 0.055, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2596, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0773, + "grad_norm": 1.40625, + "learning_rate": 3.4596008571544102e-09, + "long_answer_loss": 0.0773, + "loss": 0.0758, + "short_answer_loss": NaN, + "step": 2597, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0877, + "grad_norm": 1.3203125, + "learning_rate": 3.105030408490084e-09, + "long_answer_loss": 0.0877, + "loss": 0.0697, + "short_answer_loss": NaN, + "step": 2598, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0814, + "grad_norm": 1.4296875, + "learning_rate": 2.7696228440274308e-09, + "long_answer_loss": 0.0814, + "loss": 0.0795, + "short_answer_loss": NaN, + "step": 2599, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0646, + "grad_norm": 1.3515625, + "learning_rate": 2.453378678085594e-09, + "long_answer_loss": 0.0646, + "loss": 0.0716, + "short_answer_loss": NaN, + "step": 2600, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0733, + "grad_norm": 1.3984375, + "learning_rate": 2.1562983955975003e-09, + "long_answer_loss": 0.0733, + "loss": 0.0691, + "short_answer_loss": NaN, + "step": 2601, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0927, + "grad_norm": 1.4375, + "learning_rate": 1.8783824521070857e-09, + "long_answer_loss": 0.0927, + "loss": 0.0753, + "short_answer_loss": NaN, + "step": 2602, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0754, + "grad_norm": 1.421875, + "learning_rate": 1.6196312737762342e-09, + "long_answer_loss": 0.0754, + "loss": 0.0723, + "short_answer_loss": NaN, + "step": 2603, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0783, + "grad_norm": 1.421875, + "learning_rate": 1.3800452573750623e-09, + "long_answer_loss": 0.0783, + "loss": 0.072, + "short_answer_loss": NaN, + "step": 2604, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0784, + "grad_norm": 1.3671875, + "learning_rate": 1.1596247702888584e-09, + "long_answer_loss": 0.0784, + "loss": 0.0687, + "short_answer_loss": NaN, + "step": 2605, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0873, + "grad_norm": 1.421875, + "learning_rate": 9.583701505139208e-10, + "long_answer_loss": 0.0873, + "loss": 0.0739, + "short_answer_loss": NaN, + "step": 2606, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0656, + "grad_norm": 1.59375, + "learning_rate": 7.762817066533923e-10, + "long_answer_loss": 0.0656, + "loss": 0.0734, + "short_answer_loss": NaN, + "step": 2607, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0654, + "grad_norm": 1.3515625, + "learning_rate": 6.133597179269757e-10, + "long_answer_loss": 0.0654, + "loss": 0.0684, + "short_answer_loss": NaN, + "step": 2608, + "template_loss": 0.0 + }, + { + "epoch": 1.99, + "full_loss": 0.0674, + "grad_norm": 1.4296875, + "learning_rate": 4.696044341598315e-10, + "long_answer_loss": 0.0674, + "loss": 0.0672, + "short_answer_loss": NaN, + "step": 2609, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0789, + "grad_norm": 1.3984375, + "learning_rate": 3.450160757881293e-10, + "long_answer_loss": 0.0789, + "loss": 0.0689, + "short_answer_loss": NaN, + "step": 2610, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0605, + "grad_norm": 1.359375, + "learning_rate": 2.3959483385627146e-10, + "long_answer_loss": 0.0605, + "loss": 0.0675, + "short_answer_loss": NaN, + "step": 2611, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0674, + "grad_norm": 1.390625, + "learning_rate": 1.5334087001828145e-10, + "long_answer_loss": 0.0674, + "loss": 0.07, + "short_answer_loss": NaN, + "step": 2612, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0619, + "grad_norm": 1.421875, + "learning_rate": 8.625431653919158e-11, + "long_answer_loss": 0.0619, + "loss": 0.0748, + "short_answer_loss": NaN, + "step": 2613, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0675, + "grad_norm": 1.5078125, + "learning_rate": 3.833527628810396e-11, + "long_answer_loss": 0.0675, + "loss": 0.0763, + "short_answer_loss": NaN, + "step": 2614, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0748, + "grad_norm": 1.6015625, + "learning_rate": 9.583822746517257e-12, + "long_answer_loss": 0.0748, + "loss": 0.0747, + "short_answer_loss": NaN, + "step": 2615, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "full_loss": 0.0656, + "grad_norm": 1.34375, + "learning_rate": 0.0, + "long_answer_loss": 0.0656, + "loss": 0.0676, + "short_answer_loss": NaN, + "step": 2616, + "template_loss": 0.0 + }, + { + "epoch": 2.0, + "step": 2616, + "total_flos": 3.5412472154040566e+18, + "train_loss": 0.1190942916392551, + "train_runtime": 17303.3718, + "train_samples_per_second": 19.355, + "train_steps_per_second": 0.151 + } + ], + "logging_steps": 1.0, + "max_steps": 2616, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1.0, + "total_flos": 3.5412472154040566e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}