{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9540481400437635,
  "eval_steps": 1.0,
  "global_step": 108,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 6.122653587017292,
      "learning_rate": 0.0,
      "loss": 0.646,
      "step": 1
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.716859801224774,
      "learning_rate": 5e-06,
      "loss": 0.5427,
      "step": 2
    },
    {
      "epoch": 0.08,
      "grad_norm": 5.019981578406485,
      "learning_rate": 7.924812503605782e-06,
      "loss": 0.549,
      "step": 3
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.467488328063134,
      "learning_rate": 1e-05,
      "loss": 0.4574,
      "step": 4
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.728992664952675,
      "learning_rate": 1e-05,
      "loss": 0.4952,
      "step": 5
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.970029793004458,
      "learning_rate": 1e-05,
      "loss": 0.4373,
      "step": 6
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.152396699018966,
      "learning_rate": 1e-05,
      "loss": 0.4491,
      "step": 7
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.2898095220878196,
      "learning_rate": 1e-05,
      "loss": 0.421,
      "step": 8
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.8312653886857113,
      "learning_rate": 1e-05,
      "loss": 0.3626,
      "step": 9
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.8281463002677139,
      "learning_rate": 1e-05,
      "loss": 0.3628,
      "step": 10
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.7367109671193546,
      "learning_rate": 1e-05,
      "loss": 0.3394,
      "step": 11
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.6255181971686055,
      "learning_rate": 1e-05,
      "loss": 0.333,
      "step": 12
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.6622699558938445,
      "learning_rate": 1e-05,
      "loss": 0.313,
      "step": 13
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.4241097114069272,
      "learning_rate": 1e-05,
      "loss": 0.2917,
      "step": 14
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.3259856150813727,
      "learning_rate": 1e-05,
      "loss": 0.2742,
      "step": 15
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.363385021609024,
      "learning_rate": 1e-05,
      "loss": 0.2931,
      "step": 16
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.612345679634704,
      "learning_rate": 1e-05,
      "loss": 0.2723,
      "step": 17
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.394500889215576,
      "learning_rate": 1e-05,
      "loss": 0.2623,
      "step": 18
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.2629355627249936,
      "learning_rate": 1e-05,
      "loss": 0.2719,
      "step": 19
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.4121814773595716,
      "learning_rate": 1e-05,
      "loss": 0.2453,
      "step": 20
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.2877132103814335,
      "learning_rate": 1e-05,
      "loss": 0.244,
      "step": 21
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.2523378620255512,
      "learning_rate": 1e-05,
      "loss": 0.2349,
      "step": 22
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.1902225037740863,
      "learning_rate": 1e-05,
      "loss": 0.2265,
      "step": 23
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.1879613096031454,
      "learning_rate": 1e-05,
      "loss": 0.2144,
      "step": 24
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.1464570881387057,
      "learning_rate": 1e-05,
      "loss": 0.2063,
      "step": 25
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.2752585041011075,
      "learning_rate": 1e-05,
      "loss": 0.2157,
      "step": 26
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.155543288413803,
      "learning_rate": 1e-05,
      "loss": 0.1916,
      "step": 27
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.2042714749577323,
      "learning_rate": 1e-05,
      "loss": 0.2071,
      "step": 28
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.1346405455010145,
      "learning_rate": 1e-05,
      "loss": 0.1991,
      "step": 29
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.0793677741286372,
      "learning_rate": 1e-05,
      "loss": 0.1895,
      "step": 30
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.0641597541926833,
      "learning_rate": 1e-05,
      "loss": 0.1968,
      "step": 31
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.1159083684301505,
      "learning_rate": 1e-05,
      "loss": 0.1846,
      "step": 32
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.0614368255136861,
      "learning_rate": 1e-05,
      "loss": 0.1849,
      "step": 33
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.9659837421382899,
      "learning_rate": 1e-05,
      "loss": 0.1677,
      "step": 34
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.9869347069695258,
      "learning_rate": 1e-05,
      "loss": 0.1789,
      "step": 35
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.0555806958429526,
      "learning_rate": 1e-05,
      "loss": 0.1901,
      "step": 36
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.0003790636897225,
      "learning_rate": 1e-05,
      "loss": 0.1317,
      "step": 37
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.8755581193987241,
      "learning_rate": 1e-05,
      "loss": 0.109,
      "step": 38
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.9600336243215675,
      "learning_rate": 1e-05,
      "loss": 0.1225,
      "step": 39
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.983666193649008,
      "learning_rate": 1e-05,
      "loss": 0.1206,
      "step": 40
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.9984047080313273,
      "learning_rate": 1e-05,
      "loss": 0.114,
      "step": 41
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.0701560459785802,
      "learning_rate": 1e-05,
      "loss": 0.1143,
      "step": 42
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.0722426369355342,
      "learning_rate": 1e-05,
      "loss": 0.124,
      "step": 43
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.8817516131260538,
      "learning_rate": 1e-05,
      "loss": 0.1079,
      "step": 44
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.9625731291493045,
      "learning_rate": 1e-05,
      "loss": 0.1222,
      "step": 45
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.9812159017657305,
      "learning_rate": 1e-05,
      "loss": 0.1096,
      "step": 46
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.8745591684992073,
      "learning_rate": 1e-05,
      "loss": 0.1149,
      "step": 47
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.9507892298975904,
      "learning_rate": 1e-05,
      "loss": 0.1163,
      "step": 48
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.8611093549236812,
      "learning_rate": 1e-05,
      "loss": 0.11,
      "step": 49
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.8444613309525189,
      "learning_rate": 1e-05,
      "loss": 0.1054,
      "step": 50
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.9868965033294682,
      "learning_rate": 1e-05,
      "loss": 0.1206,
      "step": 51
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.7940733083936387,
      "learning_rate": 1e-05,
      "loss": 0.1072,
      "step": 52
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.8572147743881433,
      "learning_rate": 1e-05,
      "loss": 0.1044,
      "step": 53
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.8209526691122747,
      "learning_rate": 1e-05,
      "loss": 0.1009,
      "step": 54
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7779846619307967,
      "learning_rate": 1e-05,
      "loss": 0.1018,
      "step": 55
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.8291999953627118,
      "learning_rate": 1e-05,
      "loss": 0.1025,
      "step": 56
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.8321877301816655,
      "learning_rate": 1e-05,
      "loss": 0.1094,
      "step": 57
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.8542389871893485,
      "learning_rate": 1e-05,
      "loss": 0.1079,
      "step": 58
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.7737670400014411,
      "learning_rate": 1e-05,
      "loss": 0.0958,
      "step": 59
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.8129322360573784,
      "learning_rate": 1e-05,
      "loss": 0.093,
      "step": 60
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.8293838232530079,
      "learning_rate": 1e-05,
      "loss": 0.1054,
      "step": 61
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.6810818138246434,
      "learning_rate": 1e-05,
      "loss": 0.0906,
      "step": 62
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.7937807744835117,
      "learning_rate": 1e-05,
      "loss": 0.098,
      "step": 63
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.8224807756832562,
      "learning_rate": 1e-05,
      "loss": 0.1101,
      "step": 64
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.7304387601530952,
      "learning_rate": 1e-05,
      "loss": 0.1,
      "step": 65
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.8142026342771219,
      "learning_rate": 1e-05,
      "loss": 0.0964,
      "step": 66
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.7431609339195293,
      "learning_rate": 1e-05,
      "loss": 0.0936,
      "step": 67
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.7512520528680077,
      "learning_rate": 1e-05,
      "loss": 0.0949,
      "step": 68
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.7538760164866836,
      "learning_rate": 1e-05,
      "loss": 0.0989,
      "step": 69
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.8103341693726498,
      "learning_rate": 1e-05,
      "loss": 0.1028,
      "step": 70
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.8357385002863533,
      "learning_rate": 1e-05,
      "loss": 0.0966,
      "step": 71
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.7944109386823767,
      "learning_rate": 1e-05,
      "loss": 0.1032,
      "step": 72
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.8167924040143067,
      "learning_rate": 1e-05,
      "loss": 0.1003,
      "step": 73
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.6224097456559627,
      "learning_rate": 1e-05,
      "loss": 0.0604,
      "step": 74
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.6419960808589802,
      "learning_rate": 1e-05,
      "loss": 0.0652,
      "step": 75
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.7787247503593108,
      "learning_rate": 1e-05,
      "loss": 0.0709,
      "step": 76
    },
    {
      "epoch": 2.11,
      "grad_norm": 0.7252667545531377,
      "learning_rate": 1e-05,
      "loss": 0.0571,
      "step": 77
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.7217097658566882,
      "learning_rate": 1e-05,
      "loss": 0.0656,
      "step": 78
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.7168372530155407,
      "learning_rate": 1e-05,
      "loss": 0.0527,
      "step": 79
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.7418408777634922,
      "learning_rate": 1e-05,
      "loss": 0.0585,
      "step": 80
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.7101096286468248,
      "learning_rate": 1e-05,
      "loss": 0.0509,
      "step": 81
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.7006749099813174,
      "learning_rate": 1e-05,
      "loss": 0.0576,
      "step": 82
    },
    {
      "epoch": 2.27,
      "grad_norm": 0.7944497077223811,
      "learning_rate": 1e-05,
      "loss": 0.0545,
      "step": 83
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.6722848545285588,
      "learning_rate": 1e-05,
      "loss": 0.0563,
      "step": 84
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.7120414843684311,
      "learning_rate": 1e-05,
      "loss": 0.058,
      "step": 85
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.7279580151325783,
      "learning_rate": 1e-05,
      "loss": 0.0552,
      "step": 86
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.7029506482473885,
      "learning_rate": 1e-05,
      "loss": 0.0577,
      "step": 87
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.6189514282541002,
      "learning_rate": 1e-05,
      "loss": 0.0547,
      "step": 88
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.6229759439930223,
      "learning_rate": 1e-05,
      "loss": 0.0513,
      "step": 89
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.6863028991803624,
      "learning_rate": 1e-05,
      "loss": 0.0613,
      "step": 90
    },
    {
      "epoch": 2.49,
      "grad_norm": 0.7364535429784711,
      "learning_rate": 1e-05,
      "loss": 0.0652,
      "step": 91
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.7387032302781582,
      "learning_rate": 1e-05,
      "loss": 0.0638,
      "step": 92
    },
    {
      "epoch": 2.54,
      "grad_norm": 0.6757297547267043,
      "learning_rate": 1e-05,
      "loss": 0.0586,
      "step": 93
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.6588502217604668,
      "learning_rate": 1e-05,
      "loss": 0.05,
      "step": 94
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.6612243757810015,
      "learning_rate": 1e-05,
      "loss": 0.0565,
      "step": 95
    },
    {
      "epoch": 2.63,
      "grad_norm": 0.6510872422256165,
      "learning_rate": 1e-05,
      "loss": 0.0564,
      "step": 96
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.6599878520531972,
      "learning_rate": 1e-05,
      "loss": 0.0584,
      "step": 97
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.6723176479777001,
      "learning_rate": 1e-05,
      "loss": 0.0596,
      "step": 98
    },
    {
      "epoch": 2.71,
      "grad_norm": 0.6738851793824463,
      "learning_rate": 1e-05,
      "loss": 0.0568,
      "step": 99
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.6730157693288188,
      "learning_rate": 1e-05,
      "loss": 0.0567,
      "step": 100
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.6025834169032148,
      "learning_rate": 1e-05,
      "loss": 0.0543,
      "step": 101
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.5662111947365751,
      "learning_rate": 1e-05,
      "loss": 0.0521,
      "step": 102
    },
    {
      "epoch": 2.82,
      "grad_norm": 0.6744169703896066,
      "learning_rate": 1e-05,
      "loss": 0.0589,
      "step": 103
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.6312659616633817,
      "learning_rate": 1e-05,
      "loss": 0.0544,
      "step": 104
    },
    {
      "epoch": 2.87,
      "grad_norm": 0.6011739294981976,
      "learning_rate": 1e-05,
      "loss": 0.055,
      "step": 105
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.6427838412250556,
      "learning_rate": 1e-05,
      "loss": 0.0582,
      "step": 106
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.6537825081243189,
      "learning_rate": 1e-05,
      "loss": 0.0579,
      "step": 107
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.6762138754041659,
      "learning_rate": 1e-05,
      "loss": 0.0588,
      "step": 108
    },
    {
      "epoch": 2.95,
      "step": 108,
      "total_flos": 90174812381184.0,
      "train_loss": 0.15581304762788392,
      "train_runtime": 6548.7943,
      "train_samples_per_second": 2.513,
      "train_steps_per_second": 0.016
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 108,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1.0,
  "total_flos": 90174812381184.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}