{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 47.61904761904762,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.76,
      "grad_norm": 0.32970941066741943,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.7065,
      "step": 4
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.34144940972328186,
      "learning_rate": 6.400000000000001e-05,
      "loss": 1.5961,
      "step": 8
    },
    {
      "epoch": 2.29,
      "grad_norm": 0.3820774257183075,
      "learning_rate": 9.6e-05,
      "loss": 1.5497,
      "step": 12
    },
    {
      "epoch": 3.05,
      "grad_norm": 0.359970360994339,
      "learning_rate": 0.00012800000000000002,
      "loss": 1.4212,
      "step": 16
    },
    {
      "epoch": 3.81,
      "grad_norm": 0.3855084478855133,
      "learning_rate": 0.00016,
      "loss": 1.3231,
      "step": 20
    },
    {
      "epoch": 4.57,
      "grad_norm": 0.28801724314689636,
      "learning_rate": 0.000192,
      "loss": 1.2461,
      "step": 24
    },
    {
      "epoch": 5.33,
      "grad_norm": 0.34446007013320923,
      "learning_rate": 0.00019733333333333335,
      "loss": 1.1191,
      "step": 28
    },
    {
      "epoch": 6.1,
      "grad_norm": 0.3313669264316559,
      "learning_rate": 0.0001937777777777778,
      "loss": 1.119,
      "step": 32
    },
    {
      "epoch": 6.86,
      "grad_norm": 0.2997862696647644,
      "learning_rate": 0.00019022222222222224,
      "loss": 1.0179,
      "step": 36
    },
    {
      "epoch": 7.62,
      "grad_norm": 0.2927868962287903,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.9294,
      "step": 40
    },
    {
      "epoch": 8.38,
      "grad_norm": 0.303634911775589,
      "learning_rate": 0.00018311111111111113,
      "loss": 0.8856,
      "step": 44
    },
    {
      "epoch": 9.14,
      "grad_norm": 0.4033955931663513,
      "learning_rate": 0.00017955555555555558,
      "loss": 0.86,
      "step": 48
    },
    {
      "epoch": 9.9,
      "grad_norm": 0.42172765731811523,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.7868,
      "step": 52
    },
    {
      "epoch": 10.67,
      "grad_norm": 0.3995470702648163,
      "learning_rate": 0.00017244444444444444,
      "loss": 0.7428,
      "step": 56
    },
    {
      "epoch": 11.43,
      "grad_norm": 0.4886798560619354,
      "learning_rate": 0.00016888888888888889,
      "loss": 0.6497,
      "step": 60
    },
    {
      "epoch": 12.19,
      "grad_norm": 0.8596562743186951,
      "learning_rate": 0.00016533333333333333,
      "loss": 0.6389,
      "step": 64
    },
    {
      "epoch": 12.95,
      "grad_norm": 5.950181007385254,
      "learning_rate": 0.00016177777777777778,
      "loss": 0.525,
      "step": 68
    },
    {
      "epoch": 13.71,
      "grad_norm": 2.282550096511841,
      "learning_rate": 0.00015822222222222222,
      "loss": 0.447,
      "step": 72
    },
    {
      "epoch": 14.48,
      "grad_norm": 0.877649188041687,
      "learning_rate": 0.00015466666666666667,
      "loss": 0.4354,
      "step": 76
    },
    {
      "epoch": 15.24,
      "grad_norm": 0.941230833530426,
      "learning_rate": 0.0001511111111111111,
      "loss": 0.3089,
      "step": 80
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.8986572027206421,
      "learning_rate": 0.00014755555555555556,
      "loss": 0.3118,
      "step": 84
    },
    {
      "epoch": 16.76,
      "grad_norm": 0.9937779307365417,
      "learning_rate": 0.000144,
      "loss": 0.2487,
      "step": 88
    },
    {
      "epoch": 17.52,
      "grad_norm": 0.9299382567405701,
      "learning_rate": 0.00014044444444444445,
      "loss": 0.2026,
      "step": 92
    },
    {
      "epoch": 18.29,
      "grad_norm": 1.0898679494857788,
      "learning_rate": 0.0001368888888888889,
      "loss": 0.1691,
      "step": 96
    },
    {
      "epoch": 19.05,
      "grad_norm": 1.1475147008895874,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.1497,
      "step": 100
    },
    {
      "epoch": 19.81,
      "grad_norm": 1.1220810413360596,
      "learning_rate": 0.00012977777777777779,
      "loss": 0.0984,
      "step": 104
    },
    {
      "epoch": 20.57,
      "grad_norm": 1.19789457321167,
      "learning_rate": 0.00012622222222222223,
      "loss": 0.1001,
      "step": 108
    },
    {
      "epoch": 21.33,
      "grad_norm": 0.8793210983276367,
      "learning_rate": 0.00012266666666666668,
      "loss": 0.072,
      "step": 112
    },
    {
      "epoch": 22.1,
      "grad_norm": 0.9120911955833435,
      "learning_rate": 0.00011911111111111111,
      "loss": 0.0509,
      "step": 116
    },
    {
      "epoch": 22.86,
      "grad_norm": 0.7733311653137207,
      "learning_rate": 0.00011555555555555555,
      "loss": 0.0429,
      "step": 120
    },
    {
      "epoch": 23.62,
      "grad_norm": 0.6906972527503967,
      "learning_rate": 0.00011200000000000001,
      "loss": 0.0377,
      "step": 124
    },
    {
      "epoch": 24.38,
      "grad_norm": 0.3450298607349396,
      "learning_rate": 0.00010844444444444446,
      "loss": 0.0271,
      "step": 128
    },
    {
      "epoch": 25.14,
      "grad_norm": 0.40086209774017334,
      "learning_rate": 0.0001048888888888889,
      "loss": 0.0264,
      "step": 132
    },
    {
      "epoch": 25.9,
      "grad_norm": 0.7334154844284058,
      "learning_rate": 0.00010133333333333335,
      "loss": 0.0212,
      "step": 136
    },
    {
      "epoch": 26.67,
      "grad_norm": 0.2674214243888855,
      "learning_rate": 9.777777777777778e-05,
      "loss": 0.0169,
      "step": 140
    },
    {
      "epoch": 27.43,
      "grad_norm": 0.2615182399749756,
      "learning_rate": 9.422222222222223e-05,
      "loss": 0.0173,
      "step": 144
    },
    {
      "epoch": 28.19,
      "grad_norm": 0.12926605343818665,
      "learning_rate": 9.066666666666667e-05,
      "loss": 0.0149,
      "step": 148
    },
    {
      "epoch": 28.95,
      "grad_norm": 0.1451052874326706,
      "learning_rate": 8.711111111111112e-05,
      "loss": 0.0142,
      "step": 152
    },
    {
      "epoch": 29.71,
      "grad_norm": 0.12337276339530945,
      "learning_rate": 8.355555555555556e-05,
      "loss": 0.0111,
      "step": 156
    },
    {
      "epoch": 30.48,
      "grad_norm": 0.10490886121988297,
      "learning_rate": 8e-05,
      "loss": 0.0126,
      "step": 160
    },
    {
      "epoch": 31.24,
      "grad_norm": 0.12196756899356842,
      "learning_rate": 7.644444444444445e-05,
      "loss": 0.01,
      "step": 164
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.06647361814975739,
      "learning_rate": 7.28888888888889e-05,
      "loss": 0.0106,
      "step": 168
    },
    {
      "epoch": 32.76,
      "grad_norm": 0.09191016852855682,
      "learning_rate": 6.933333333333334e-05,
      "loss": 0.0084,
      "step": 172
    },
    {
      "epoch": 33.52,
      "grad_norm": 0.08465249091386795,
      "learning_rate": 6.577777777777779e-05,
      "loss": 0.0093,
      "step": 176
    },
    {
      "epoch": 34.29,
      "grad_norm": 0.13242019712924957,
      "learning_rate": 6.222222222222222e-05,
      "loss": 0.0095,
      "step": 180
    },
    {
      "epoch": 35.05,
      "grad_norm": 0.07912217080593109,
      "learning_rate": 5.866666666666667e-05,
      "loss": 0.0079,
      "step": 184
    },
    {
      "epoch": 35.81,
      "grad_norm": 0.08500321954488754,
      "learning_rate": 5.511111111111111e-05,
      "loss": 0.0069,
      "step": 188
    },
    {
      "epoch": 36.57,
      "grad_norm": 0.12945592403411865,
      "learning_rate": 5.1555555555555556e-05,
      "loss": 0.0076,
      "step": 192
    },
    {
      "epoch": 37.33,
      "grad_norm": 0.05908092483878136,
      "learning_rate": 4.8e-05,
      "loss": 0.0079,
      "step": 196
    },
    {
      "epoch": 38.1,
      "grad_norm": 0.05590814724564552,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.0071,
      "step": 200
    },
    {
      "epoch": 38.86,
      "grad_norm": 0.07217204570770264,
      "learning_rate": 4.088888888888889e-05,
      "loss": 0.0071,
      "step": 204
    },
    {
      "epoch": 39.62,
      "grad_norm": 0.08633767813444138,
      "learning_rate": 3.733333333333334e-05,
      "loss": 0.0061,
      "step": 208
    },
    {
      "epoch": 40.38,
      "grad_norm": 0.08432795852422714,
      "learning_rate": 3.377777777777778e-05,
      "loss": 0.007,
      "step": 212
    },
    {
      "epoch": 41.14,
      "grad_norm": 0.06549498438835144,
      "learning_rate": 3.0222222222222225e-05,
      "loss": 0.0062,
      "step": 216
    },
    {
      "epoch": 41.9,
      "grad_norm": 0.07968047261238098,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.0056,
      "step": 220
    },
    {
      "epoch": 42.67,
      "grad_norm": 0.054360050708055496,
      "learning_rate": 2.3111111111111112e-05,
      "loss": 0.0064,
      "step": 224
    },
    {
      "epoch": 43.43,
      "grad_norm": 0.08684072643518448,
      "learning_rate": 1.9555555555555557e-05,
      "loss": 0.0052,
      "step": 228
    },
    {
      "epoch": 44.19,
      "grad_norm": 0.08560307323932648,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.0066,
      "step": 232
    },
    {
      "epoch": 44.95,
      "grad_norm": 0.08709636330604553,
      "learning_rate": 1.2444444444444445e-05,
      "loss": 0.0055,
      "step": 236
    },
    {
      "epoch": 45.71,
      "grad_norm": 0.03685537353157997,
      "learning_rate": 8.88888888888889e-06,
      "loss": 0.0054,
      "step": 240
    },
    {
      "epoch": 46.48,
      "grad_norm": 0.10162707418203354,
      "learning_rate": 5.333333333333334e-06,
      "loss": 0.0059,
      "step": 244
    },
    {
      "epoch": 47.24,
      "grad_norm": 0.061479195952415466,
      "learning_rate": 1.777777777777778e-06,
      "loss": 0.0051,
      "step": 248
    }
  ],
  "logging_steps": 4,
  "max_steps": 250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "total_flos": 4.0647058784256e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}