|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.6470588235294117, |
|
"eval_steps": 500, |
|
"global_step": 90000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014705882352941176, |
|
"grad_norm": 0.35731062293052673, |
|
"learning_rate": 4.975490196078432e-05, |
|
"loss": 0.5354, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.029411764705882353, |
|
"grad_norm": 2.1595866680145264, |
|
"learning_rate": 4.9509803921568634e-05, |
|
"loss": 0.1627, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04411764705882353, |
|
"grad_norm": 0.14825384318828583, |
|
"learning_rate": 4.9264705882352944e-05, |
|
"loss": 0.101, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 0.07121703773736954, |
|
"learning_rate": 4.901960784313725e-05, |
|
"loss": 0.0952, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.07352941176470588, |
|
"grad_norm": 0.06089532747864723, |
|
"learning_rate": 4.877450980392157e-05, |
|
"loss": 0.0881, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.08823529411764706, |
|
"grad_norm": 0.037034619599580765, |
|
"learning_rate": 4.8529411764705885e-05, |
|
"loss": 0.0717, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10294117647058823, |
|
"grad_norm": 0.01999847963452339, |
|
"learning_rate": 4.82843137254902e-05, |
|
"loss": 0.0848, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.018650399520993233, |
|
"learning_rate": 4.803921568627452e-05, |
|
"loss": 0.068, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1323529411764706, |
|
"grad_norm": 0.023023229092359543, |
|
"learning_rate": 4.7794117647058826e-05, |
|
"loss": 0.0626, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 0.018715515732765198, |
|
"learning_rate": 4.7549019607843135e-05, |
|
"loss": 0.0555, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.16176470588235295, |
|
"grad_norm": 0.11842140555381775, |
|
"learning_rate": 4.730392156862745e-05, |
|
"loss": 0.0658, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.17647058823529413, |
|
"grad_norm": 0.04816881939768791, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 0.0648, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.19117647058823528, |
|
"grad_norm": 0.025240018963813782, |
|
"learning_rate": 4.681372549019608e-05, |
|
"loss": 0.0632, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.20588235294117646, |
|
"grad_norm": 0.007390766404569149, |
|
"learning_rate": 4.656862745098039e-05, |
|
"loss": 0.0705, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.22058823529411764, |
|
"grad_norm": 0.011664963327348232, |
|
"learning_rate": 4.632352941176471e-05, |
|
"loss": 0.0552, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 39.69175720214844, |
|
"learning_rate": 4.607843137254902e-05, |
|
"loss": 0.0686, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.007366931065917015, |
|
"learning_rate": 4.5833333333333334e-05, |
|
"loss": 0.061, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.2647058823529412, |
|
"grad_norm": 0.003396671498194337, |
|
"learning_rate": 4.558823529411765e-05, |
|
"loss": 0.0464, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.27941176470588236, |
|
"grad_norm": 0.045366521924734116, |
|
"learning_rate": 4.5343137254901966e-05, |
|
"loss": 0.053, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 1.137495756149292, |
|
"learning_rate": 4.5098039215686275e-05, |
|
"loss": 0.0484, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3088235294117647, |
|
"grad_norm": 0.0062417215667665005, |
|
"learning_rate": 4.485294117647059e-05, |
|
"loss": 0.0564, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.3235294117647059, |
|
"grad_norm": 0.003990447614341974, |
|
"learning_rate": 4.460784313725491e-05, |
|
"loss": 0.0364, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3382352941176471, |
|
"grad_norm": 0.11569799482822418, |
|
"learning_rate": 4.4362745098039216e-05, |
|
"loss": 0.0436, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.01159907691180706, |
|
"learning_rate": 4.411764705882353e-05, |
|
"loss": 0.0456, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.36764705882352944, |
|
"grad_norm": 10.926911354064941, |
|
"learning_rate": 4.387254901960784e-05, |
|
"loss": 0.0464, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.38235294117647056, |
|
"grad_norm": 0.03444543853402138, |
|
"learning_rate": 4.362745098039216e-05, |
|
"loss": 0.0512, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.39705882352941174, |
|
"grad_norm": 0.002395658055320382, |
|
"learning_rate": 4.3382352941176474e-05, |
|
"loss": 0.0586, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.4117647058823529, |
|
"grad_norm": 0.010224021971225739, |
|
"learning_rate": 4.313725490196079e-05, |
|
"loss": 0.0407, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.4264705882352941, |
|
"grad_norm": 0.010642035864293575, |
|
"learning_rate": 4.28921568627451e-05, |
|
"loss": 0.0536, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.00970557238906622, |
|
"learning_rate": 4.2647058823529415e-05, |
|
"loss": 0.0526, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.45588235294117646, |
|
"grad_norm": 0.0030253385193645954, |
|
"learning_rate": 4.2401960784313724e-05, |
|
"loss": 0.0432, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.5423064827919006, |
|
"learning_rate": 4.215686274509804e-05, |
|
"loss": 0.0381, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.4852941176470588, |
|
"grad_norm": 7.454125881195068, |
|
"learning_rate": 4.1911764705882356e-05, |
|
"loss": 0.0397, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.87744402885437, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.0564, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5147058823529411, |
|
"grad_norm": 33.39067840576172, |
|
"learning_rate": 4.142156862745099e-05, |
|
"loss": 0.0455, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.5294117647058824, |
|
"grad_norm": 0.008246080949902534, |
|
"learning_rate": 4.11764705882353e-05, |
|
"loss": 0.0515, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.5441176470588235, |
|
"grad_norm": 0.018734918907284737, |
|
"learning_rate": 4.0931372549019607e-05, |
|
"loss": 0.0453, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.5588235294117647, |
|
"grad_norm": 0.007373027969151735, |
|
"learning_rate": 4.068627450980392e-05, |
|
"loss": 0.0485, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.5735294117647058, |
|
"grad_norm": 0.0023267469368875027, |
|
"learning_rate": 4.044117647058824e-05, |
|
"loss": 0.0372, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.1311252862215042, |
|
"learning_rate": 4.0196078431372555e-05, |
|
"loss": 0.0442, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6029411764705882, |
|
"grad_norm": 0.003710985416546464, |
|
"learning_rate": 3.9950980392156864e-05, |
|
"loss": 0.0585, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.6176470588235294, |
|
"grad_norm": 0.004861747846007347, |
|
"learning_rate": 3.970588235294117e-05, |
|
"loss": 0.0374, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.6323529411764706, |
|
"grad_norm": 0.004351571202278137, |
|
"learning_rate": 3.946078431372549e-05, |
|
"loss": 0.0265, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.6470588235294118, |
|
"grad_norm": 0.008851751685142517, |
|
"learning_rate": 3.9215686274509805e-05, |
|
"loss": 0.0314, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.6617647058823529, |
|
"grad_norm": 0.0046307104639709, |
|
"learning_rate": 3.897058823529412e-05, |
|
"loss": 0.0445, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.6764705882352942, |
|
"grad_norm": 0.002735880669206381, |
|
"learning_rate": 3.872549019607844e-05, |
|
"loss": 0.0433, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.6911764705882353, |
|
"grad_norm": 0.0218490082770586, |
|
"learning_rate": 3.8480392156862746e-05, |
|
"loss": 0.0444, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 2.371448278427124, |
|
"learning_rate": 3.8235294117647055e-05, |
|
"loss": 0.033, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.7205882352941176, |
|
"grad_norm": 0.005015780217945576, |
|
"learning_rate": 3.799019607843137e-05, |
|
"loss": 0.0422, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 1.3102850914001465, |
|
"learning_rate": 3.774509803921569e-05, |
|
"loss": 0.035, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.004522955510765314, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0303, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.7647058823529411, |
|
"grad_norm": 0.006077844649553299, |
|
"learning_rate": 3.725490196078432e-05, |
|
"loss": 0.0503, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.7794117647058824, |
|
"grad_norm": 0.0026446939446032047, |
|
"learning_rate": 3.700980392156863e-05, |
|
"loss": 0.0264, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.7941176470588235, |
|
"grad_norm": 0.004485867917537689, |
|
"learning_rate": 3.6764705882352945e-05, |
|
"loss": 0.0388, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.8088235294117647, |
|
"grad_norm": 0.0073866695165634155, |
|
"learning_rate": 3.6519607843137254e-05, |
|
"loss": 0.0283, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.012984287925064564, |
|
"learning_rate": 3.627450980392157e-05, |
|
"loss": 0.0393, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.8382352941176471, |
|
"grad_norm": 0.01751883700489998, |
|
"learning_rate": 3.6029411764705886e-05, |
|
"loss": 0.0388, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.8529411764705882, |
|
"grad_norm": 0.0035843336954712868, |
|
"learning_rate": 3.5784313725490195e-05, |
|
"loss": 0.0384, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.8676470588235294, |
|
"grad_norm": 0.0645672082901001, |
|
"learning_rate": 3.553921568627451e-05, |
|
"loss": 0.0378, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.008566264994442463, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 0.0315, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.8970588235294118, |
|
"grad_norm": 0.010571740567684174, |
|
"learning_rate": 3.5049019607843136e-05, |
|
"loss": 0.0324, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.9117647058823529, |
|
"grad_norm": 0.0022533361334353685, |
|
"learning_rate": 3.480392156862745e-05, |
|
"loss": 0.0292, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.9264705882352942, |
|
"grad_norm": 0.006164130289107561, |
|
"learning_rate": 3.455882352941177e-05, |
|
"loss": 0.0407, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.007435985840857029, |
|
"learning_rate": 3.431372549019608e-05, |
|
"loss": 0.0449, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.9558823529411765, |
|
"grad_norm": 0.003777585458010435, |
|
"learning_rate": 3.4068627450980394e-05, |
|
"loss": 0.0247, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.9705882352941176, |
|
"grad_norm": 0.005975374951958656, |
|
"learning_rate": 3.382352941176471e-05, |
|
"loss": 0.0309, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.9852941176470589, |
|
"grad_norm": 0.0016012012492865324, |
|
"learning_rate": 3.357843137254902e-05, |
|
"loss": 0.0294, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.020027656108140945, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.022, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.0147058823529411, |
|
"grad_norm": 10.916418075561523, |
|
"learning_rate": 3.308823529411765e-05, |
|
"loss": 0.0264, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 0.013728056102991104, |
|
"learning_rate": 3.284313725490196e-05, |
|
"loss": 0.032, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.0441176470588236, |
|
"grad_norm": 0.02546643279492855, |
|
"learning_rate": 3.2598039215686276e-05, |
|
"loss": 0.0404, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 1.0588235294117647, |
|
"grad_norm": 0.004185052588582039, |
|
"learning_rate": 3.235294117647059e-05, |
|
"loss": 0.0306, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.0735294117647058, |
|
"grad_norm": 0.016653403639793396, |
|
"learning_rate": 3.210784313725491e-05, |
|
"loss": 0.0217, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 1.088235294117647, |
|
"grad_norm": 0.0028331661596894264, |
|
"learning_rate": 3.186274509803922e-05, |
|
"loss": 0.0265, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.1029411764705883, |
|
"grad_norm": 0.013931985944509506, |
|
"learning_rate": 3.161764705882353e-05, |
|
"loss": 0.0399, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 1.1176470588235294, |
|
"grad_norm": 0.001668413169682026, |
|
"learning_rate": 3.137254901960784e-05, |
|
"loss": 0.0202, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.1323529411764706, |
|
"grad_norm": 0.00567347789183259, |
|
"learning_rate": 3.112745098039216e-05, |
|
"loss": 0.0336, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 1.1470588235294117, |
|
"grad_norm": 0.003477458842098713, |
|
"learning_rate": 3.0882352941176475e-05, |
|
"loss": 0.0253, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.161764705882353, |
|
"grad_norm": 0.21819466352462769, |
|
"learning_rate": 3.063725490196079e-05, |
|
"loss": 0.0293, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.004708552733063698, |
|
"learning_rate": 3.0392156862745097e-05, |
|
"loss": 0.0268, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.1911764705882353, |
|
"grad_norm": 0.0018363581039011478, |
|
"learning_rate": 3.0147058823529413e-05, |
|
"loss": 0.0266, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.2058823529411764, |
|
"grad_norm": 0.005581580102443695, |
|
"learning_rate": 2.9901960784313725e-05, |
|
"loss": 0.0361, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.2205882352941178, |
|
"grad_norm": 0.0035322746261954308, |
|
"learning_rate": 2.965686274509804e-05, |
|
"loss": 0.0274, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 1.2352941176470589, |
|
"grad_norm": 0.0038708180654793978, |
|
"learning_rate": 2.9411764705882354e-05, |
|
"loss": 0.0241, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.0019606975838541985, |
|
"learning_rate": 2.916666666666667e-05, |
|
"loss": 0.0219, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 1.2647058823529411, |
|
"grad_norm": 0.00241417670622468, |
|
"learning_rate": 2.8921568627450986e-05, |
|
"loss": 0.0244, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.2794117647058822, |
|
"grad_norm": 0.003593308152630925, |
|
"learning_rate": 2.8676470588235295e-05, |
|
"loss": 0.027, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 1.2941176470588236, |
|
"grad_norm": 1.4003372192382812, |
|
"learning_rate": 2.8431372549019608e-05, |
|
"loss": 0.0392, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.3088235294117647, |
|
"grad_norm": 0.41921645402908325, |
|
"learning_rate": 2.8186274509803924e-05, |
|
"loss": 0.0299, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 1.567896842956543, |
|
"learning_rate": 2.7941176470588236e-05, |
|
"loss": 0.0282, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.3382352941176472, |
|
"grad_norm": 0.003143745008856058, |
|
"learning_rate": 2.7696078431372552e-05, |
|
"loss": 0.0197, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.0016428233357146382, |
|
"learning_rate": 2.7450980392156865e-05, |
|
"loss": 0.0344, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.3676470588235294, |
|
"grad_norm": 0.005276167765259743, |
|
"learning_rate": 2.7205882352941174e-05, |
|
"loss": 0.0312, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.3823529411764706, |
|
"grad_norm": 0.004461635369807482, |
|
"learning_rate": 2.696078431372549e-05, |
|
"loss": 0.025, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.3970588235294117, |
|
"grad_norm": 0.003238542238250375, |
|
"learning_rate": 2.6715686274509806e-05, |
|
"loss": 0.0236, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 0.0017602238804101944, |
|
"learning_rate": 2.647058823529412e-05, |
|
"loss": 0.0234, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.4264705882352942, |
|
"grad_norm": 0.002768098609521985, |
|
"learning_rate": 2.6225490196078435e-05, |
|
"loss": 0.026, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.4411764705882353, |
|
"grad_norm": 0.002579926745966077, |
|
"learning_rate": 2.5980392156862747e-05, |
|
"loss": 0.0167, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.4558823529411764, |
|
"grad_norm": 0.008888750337064266, |
|
"learning_rate": 2.5735294117647057e-05, |
|
"loss": 0.0288, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.0038515792693942785, |
|
"learning_rate": 2.5490196078431373e-05, |
|
"loss": 0.0252, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.4852941176470589, |
|
"grad_norm": 4.428643226623535, |
|
"learning_rate": 2.5245098039215685e-05, |
|
"loss": 0.016, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.002629584399983287, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0214, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.5147058823529411, |
|
"grad_norm": 0.004097859375178814, |
|
"learning_rate": 2.4754901960784317e-05, |
|
"loss": 0.0183, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.5294117647058822, |
|
"grad_norm": 0.00568019924685359, |
|
"learning_rate": 2.4509803921568626e-05, |
|
"loss": 0.0271, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.5441176470588234, |
|
"grad_norm": 0.0020534582436084747, |
|
"learning_rate": 2.4264705882352942e-05, |
|
"loss": 0.0281, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.5588235294117647, |
|
"grad_norm": 0.003300599753856659, |
|
"learning_rate": 2.401960784313726e-05, |
|
"loss": 0.0284, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.5735294117647058, |
|
"grad_norm": 0.001472037984058261, |
|
"learning_rate": 2.3774509803921568e-05, |
|
"loss": 0.0205, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.5882352941176472, |
|
"grad_norm": 0.002835978288203478, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 0.0234, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.6029411764705883, |
|
"grad_norm": 0.003979724366217852, |
|
"learning_rate": 2.3284313725490196e-05, |
|
"loss": 0.0233, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 0.0036057273391634226, |
|
"learning_rate": 2.303921568627451e-05, |
|
"loss": 0.0319, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.6323529411764706, |
|
"grad_norm": 0.0024822901468724012, |
|
"learning_rate": 2.2794117647058825e-05, |
|
"loss": 0.0211, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 0.008930359967052937, |
|
"learning_rate": 2.2549019607843138e-05, |
|
"loss": 0.0309, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.6617647058823528, |
|
"grad_norm": 1.3381928205490112, |
|
"learning_rate": 2.2303921568627454e-05, |
|
"loss": 0.019, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.6764705882352942, |
|
"grad_norm": 0.004481327719986439, |
|
"learning_rate": 2.2058823529411766e-05, |
|
"loss": 0.0228, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.6911764705882353, |
|
"grad_norm": 0.012597435154020786, |
|
"learning_rate": 2.181372549019608e-05, |
|
"loss": 0.0285, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 0.0034783107694238424, |
|
"learning_rate": 2.1568627450980395e-05, |
|
"loss": 0.0165, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.7205882352941178, |
|
"grad_norm": 0.0032083301339298487, |
|
"learning_rate": 2.1323529411764707e-05, |
|
"loss": 0.0306, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.7352941176470589, |
|
"grad_norm": 0.0039000194519758224, |
|
"learning_rate": 2.107843137254902e-05, |
|
"loss": 0.0199, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.0034919639583677053, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.034, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.004928025882691145, |
|
"learning_rate": 2.058823529411765e-05, |
|
"loss": 0.0213, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.7794117647058822, |
|
"grad_norm": 0.003211489412933588, |
|
"learning_rate": 2.034313725490196e-05, |
|
"loss": 0.0206, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.7941176470588234, |
|
"grad_norm": 0.011239697225391865, |
|
"learning_rate": 2.0098039215686277e-05, |
|
"loss": 0.0201, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.8088235294117647, |
|
"grad_norm": 0.0024609589017927647, |
|
"learning_rate": 1.9852941176470586e-05, |
|
"loss": 0.0286, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.8235294117647058, |
|
"grad_norm": 0.0046806493774056435, |
|
"learning_rate": 1.9607843137254903e-05, |
|
"loss": 0.0216, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.8382352941176472, |
|
"grad_norm": 2.30717396736145, |
|
"learning_rate": 1.936274509803922e-05, |
|
"loss": 0.0246, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.8529411764705883, |
|
"grad_norm": 0.00428669573739171, |
|
"learning_rate": 1.9117647058823528e-05, |
|
"loss": 0.0166, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.8676470588235294, |
|
"grad_norm": 0.011403021402657032, |
|
"learning_rate": 1.8872549019607844e-05, |
|
"loss": 0.0272, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 0.0065813250839710236, |
|
"learning_rate": 1.862745098039216e-05, |
|
"loss": 0.0217, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.8970588235294117, |
|
"grad_norm": 0.0024323465768247843, |
|
"learning_rate": 1.8382352941176472e-05, |
|
"loss": 0.0176, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 0.001616469700820744, |
|
"learning_rate": 1.8137254901960785e-05, |
|
"loss": 0.0169, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.9264705882352942, |
|
"grad_norm": 0.004322522785514593, |
|
"learning_rate": 1.7892156862745098e-05, |
|
"loss": 0.0216, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.9411764705882353, |
|
"grad_norm": 0.0024695000611245632, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.032, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.9558823529411766, |
|
"grad_norm": 0.010675052180886269, |
|
"learning_rate": 1.7401960784313726e-05, |
|
"loss": 0.023, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.9705882352941178, |
|
"grad_norm": 0.49467232823371887, |
|
"learning_rate": 1.715686274509804e-05, |
|
"loss": 0.0169, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.9852941176470589, |
|
"grad_norm": 0.01075649168342352, |
|
"learning_rate": 1.6911764705882355e-05, |
|
"loss": 0.0177, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.026703685522079468, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0259, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 2.014705882352941, |
|
"grad_norm": 0.0023890878073871136, |
|
"learning_rate": 1.642156862745098e-05, |
|
"loss": 0.0128, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 2.0294117647058822, |
|
"grad_norm": 0.0015898487763479352, |
|
"learning_rate": 1.6176470588235296e-05, |
|
"loss": 0.0112, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.0441176470588234, |
|
"grad_norm": 0.0041648312471807, |
|
"learning_rate": 1.593137254901961e-05, |
|
"loss": 0.0155, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 0.10951696336269379, |
|
"learning_rate": 1.568627450980392e-05, |
|
"loss": 0.0225, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 2.073529411764706, |
|
"grad_norm": 0.0021414640359580517, |
|
"learning_rate": 1.5441176470588237e-05, |
|
"loss": 0.0174, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 2.088235294117647, |
|
"grad_norm": 0.001627126126550138, |
|
"learning_rate": 1.5196078431372548e-05, |
|
"loss": 0.0154, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 2.1029411764705883, |
|
"grad_norm": 0.005821748171001673, |
|
"learning_rate": 1.4950980392156863e-05, |
|
"loss": 0.0255, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.0071876379661262035, |
|
"learning_rate": 1.4705882352941177e-05, |
|
"loss": 0.024, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.1323529411764706, |
|
"grad_norm": 0.00294076488353312, |
|
"learning_rate": 1.4460784313725493e-05, |
|
"loss": 0.012, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 2.1470588235294117, |
|
"grad_norm": 0.004260234069079161, |
|
"learning_rate": 1.4215686274509804e-05, |
|
"loss": 0.0202, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 2.161764705882353, |
|
"grad_norm": 0.005722737871110439, |
|
"learning_rate": 1.3970588235294118e-05, |
|
"loss": 0.0283, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 2.176470588235294, |
|
"grad_norm": 0.016162721440196037, |
|
"learning_rate": 1.3725490196078432e-05, |
|
"loss": 0.0115, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 2.1911764705882355, |
|
"grad_norm": 0.005048350431025028, |
|
"learning_rate": 1.3480392156862745e-05, |
|
"loss": 0.023, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 0.004102902952581644, |
|
"learning_rate": 1.323529411764706e-05, |
|
"loss": 0.018, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.2205882352941178, |
|
"grad_norm": 0.0021536860149353743, |
|
"learning_rate": 1.2990196078431374e-05, |
|
"loss": 0.0169, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 0.001363063813187182, |
|
"learning_rate": 1.2745098039215686e-05, |
|
"loss": 0.0129, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.0025944672524929047, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0095, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 2.264705882352941, |
|
"grad_norm": 0.001098868204280734, |
|
"learning_rate": 1.2254901960784313e-05, |
|
"loss": 0.0183, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 2.2794117647058822, |
|
"grad_norm": 0.00421817135065794, |
|
"learning_rate": 1.200980392156863e-05, |
|
"loss": 0.0214, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 2.2941176470588234, |
|
"grad_norm": 0.0016445108922198415, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.009, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.3088235294117645, |
|
"grad_norm": 0.002265740418806672, |
|
"learning_rate": 1.1519607843137254e-05, |
|
"loss": 0.0161, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 2.323529411764706, |
|
"grad_norm": 0.007099386304616928, |
|
"learning_rate": 1.1274509803921569e-05, |
|
"loss": 0.0143, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 2.338235294117647, |
|
"grad_norm": 0.003914414439350367, |
|
"learning_rate": 1.1029411764705883e-05, |
|
"loss": 0.0115, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.002513893647119403, |
|
"learning_rate": 1.0784313725490197e-05, |
|
"loss": 0.0136, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 2.3676470588235294, |
|
"grad_norm": 0.0027356306090950966, |
|
"learning_rate": 1.053921568627451e-05, |
|
"loss": 0.0247, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 2.3823529411764706, |
|
"grad_norm": 0.0817839726805687, |
|
"learning_rate": 1.0294117647058824e-05, |
|
"loss": 0.0169, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.3970588235294117, |
|
"grad_norm": 0.08163878321647644, |
|
"learning_rate": 1.0049019607843139e-05, |
|
"loss": 0.0114, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 2.411764705882353, |
|
"grad_norm": 0.002625884721055627, |
|
"learning_rate": 9.803921568627451e-06, |
|
"loss": 0.0147, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 2.426470588235294, |
|
"grad_norm": 0.004531237296760082, |
|
"learning_rate": 9.558823529411764e-06, |
|
"loss": 0.0106, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 2.4411764705882355, |
|
"grad_norm": 0.0032483581453561783, |
|
"learning_rate": 9.31372549019608e-06, |
|
"loss": 0.0183, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 2.4558823529411766, |
|
"grad_norm": 0.0050782193429768085, |
|
"learning_rate": 9.068627450980392e-06, |
|
"loss": 0.0168, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 0.003839900717139244, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 0.0209, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.485294117647059, |
|
"grad_norm": 0.008910595439374447, |
|
"learning_rate": 8.57843137254902e-06, |
|
"loss": 0.0271, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.008313077501952648, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.0175, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 2.514705882352941, |
|
"grad_norm": 0.0029984668362885714, |
|
"learning_rate": 8.088235294117648e-06, |
|
"loss": 0.0084, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 2.5294117647058822, |
|
"grad_norm": 0.004524989053606987, |
|
"learning_rate": 7.84313725490196e-06, |
|
"loss": 0.0156, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 2.5441176470588234, |
|
"grad_norm": 0.0023315059952437878, |
|
"learning_rate": 7.598039215686274e-06, |
|
"loss": 0.0121, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 2.5588235294117645, |
|
"grad_norm": 0.0010077670449391007, |
|
"learning_rate": 7.3529411764705884e-06, |
|
"loss": 0.0113, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.5735294117647056, |
|
"grad_norm": 0.0012684785760939121, |
|
"learning_rate": 7.107843137254902e-06, |
|
"loss": 0.0204, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 0.004918810911476612, |
|
"learning_rate": 6.862745098039216e-06, |
|
"loss": 0.0059, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 2.6029411764705883, |
|
"grad_norm": 0.011489451862871647, |
|
"learning_rate": 6.61764705882353e-06, |
|
"loss": 0.0273, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 2.6176470588235294, |
|
"grad_norm": 0.0028730102349072695, |
|
"learning_rate": 6.372549019607843e-06, |
|
"loss": 0.0277, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 2.6323529411764706, |
|
"grad_norm": 0.005050596781075001, |
|
"learning_rate": 6.127450980392157e-06, |
|
"loss": 0.0148, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 0.0028206182178109884, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.0121, |
|
"step": 90000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 102000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 5.579973324914688e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|