|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4991875203119922,
  "eval_steps": 128,
  "global_step": 256,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 0.5149741172790527, "learning_rate": 2.0000000000000003e-06, "loss": 1.8709, "step": 1 },
    { "epoch": 0.0, "eval_loss": 1.8383064270019531, "eval_runtime": 707.8127, "eval_samples_per_second": 7.169, "eval_steps_per_second": 1.793, "step": 1 },
    { "epoch": 0.0, "grad_norm": 0.48140937089920044, "learning_rate": 4.000000000000001e-06, "loss": 1.7751, "step": 2 },
    { "epoch": 0.01, "grad_norm": 0.4886001944541931, "learning_rate": 6e-06, "loss": 1.795, "step": 3 },
    { "epoch": 0.01, "grad_norm": 0.46349120140075684, "learning_rate": 8.000000000000001e-06, "loss": 1.7569, "step": 4 },
    { "epoch": 0.01, "grad_norm": 0.5320057272911072, "learning_rate": 1e-05, "loss": 1.9278, "step": 5 },
    { "epoch": 0.01, "grad_norm": 0.48083460330963135, "learning_rate": 1.2e-05, "loss": 1.778, "step": 6 },
    { "epoch": 0.01, "grad_norm": 0.503804624080658, "learning_rate": 1.4e-05, "loss": 1.8358, "step": 7 },
    { "epoch": 0.02, "grad_norm": 0.5177507400512695, "learning_rate": 1.6000000000000003e-05, "loss": 1.8655, "step": 8 },
    { "epoch": 0.02, "grad_norm": 0.5006410479545593, "learning_rate": 1.8e-05, "loss": 1.8087, "step": 9 },
    { "epoch": 0.02, "grad_norm": 0.500285804271698, "learning_rate": 2e-05, "loss": 1.8254, "step": 10 },
    { "epoch": 0.02, "grad_norm": 0.4819566607475281, "learning_rate": 1.9999804178263253e-05, "loss": 1.7627, "step": 11 },
    { "epoch": 0.02, "grad_norm": 0.4860954284667969, "learning_rate": 1.999921672072223e-05, "loss": 1.7034, "step": 12 },
    { "epoch": 0.03, "grad_norm": 0.5111412405967712, "learning_rate": 1.9998237650384324e-05, "loss": 1.7203, "step": 13 },
    { "epoch": 0.03, "grad_norm": 0.500988245010376, "learning_rate": 1.9996867005594193e-05, "loss": 1.6721, "step": 14 },
    { "epoch": 0.03, "grad_norm": 0.4903103709220886, "learning_rate": 1.999510484003224e-05, "loss": 1.6167, "step": 15 },
    { "epoch": 0.03, "grad_norm": 0.4756762683391571, "learning_rate": 1.999295122271253e-05, "loss": 1.57, "step": 16 },
    { "epoch": 0.03, "grad_norm": 0.4689522385597229, "learning_rate": 1.999040623798008e-05, "loss": 1.5461, "step": 17 },
    { "epoch": 0.04, "grad_norm": 0.5094612836837769, "learning_rate": 1.9987469985507553e-05, "loss": 1.5526, "step": 18 },
    { "epoch": 0.04, "grad_norm": 0.49769631028175354, "learning_rate": 1.9984142580291368e-05, "loss": 1.5115, "step": 19 },
    { "epoch": 0.04, "grad_norm": 0.46388670802116394, "learning_rate": 1.9980424152647174e-05, "loss": 1.467, "step": 20 },
    { "epoch": 0.04, "grad_norm": 0.4357146918773651, "learning_rate": 1.9976314848204762e-05, "loss": 1.3887, "step": 21 },
    { "epoch": 0.04, "grad_norm": 0.440377414226532, "learning_rate": 1.997181482790236e-05, "loss": 1.3845, "step": 22 },
    { "epoch": 0.04, "grad_norm": 0.4116402566432953, "learning_rate": 1.9966924267980326e-05, "loss": 1.4091, "step": 23 },
    { "epoch": 0.05, "grad_norm": 0.3181552588939667, "learning_rate": 1.996164335997425e-05, "loss": 1.3324, "step": 24 },
    { "epoch": 0.05, "grad_norm": 0.2932267189025879, "learning_rate": 1.995597231070744e-05, "loss": 1.315, "step": 25 },
    { "epoch": 0.05, "grad_norm": 0.328800231218338, "learning_rate": 1.994991134228285e-05, "loss": 1.3334, "step": 26 },
    { "epoch": 0.05, "grad_norm": 0.32027724385261536, "learning_rate": 1.9943460692074345e-05, "loss": 1.3161, "step": 27 },
    { "epoch": 0.05, "grad_norm": 0.3247709274291992, "learning_rate": 1.993662061271743e-05, "loss": 1.2601, "step": 28 },
    { "epoch": 0.06, "grad_norm": 0.33424896001815796, "learning_rate": 1.9929391372099352e-05, "loss": 1.2807, "step": 29 },
    { "epoch": 0.06, "grad_norm": 0.28847330808639526, "learning_rate": 1.9921773253348604e-05, "loss": 1.2427, "step": 30 },
    { "epoch": 0.06, "grad_norm": 0.2601753771305084, "learning_rate": 1.991376655482383e-05, "loss": 1.2602, "step": 31 },
    { "epoch": 0.06, "grad_norm": 0.25505828857421875, "learning_rate": 1.9905371590102157e-05, "loss": 1.2539, "step": 32 },
    { "epoch": 0.06, "grad_norm": 0.25789541006088257, "learning_rate": 1.989658868796689e-05, "loss": 1.2796, "step": 33 },
    { "epoch": 0.07, "grad_norm": 0.1963696926832199, "learning_rate": 1.988741819239467e-05, "loss": 1.2533, "step": 34 },
    { "epoch": 0.07, "grad_norm": 0.1652669906616211, "learning_rate": 1.9877860462541964e-05, "loss": 1.27, "step": 35 },
    { "epoch": 0.07, "grad_norm": 0.15272551774978638, "learning_rate": 1.986791587273103e-05, "loss": 1.2092, "step": 36 },
    { "epoch": 0.07, "grad_norm": 0.14809414744377136, "learning_rate": 1.985758481243523e-05, "loss": 1.2028, "step": 37 },
    { "epoch": 0.07, "grad_norm": 0.14091093838214874, "learning_rate": 1.98468676862638e-05, "loss": 1.1737, "step": 38 },
    { "epoch": 0.08, "grad_norm": 0.13234961032867432, "learning_rate": 1.9835764913945998e-05, "loss": 1.2242, "step": 39 },
    { "epoch": 0.08, "grad_norm": 0.12562313675880432, "learning_rate": 1.982427693031465e-05, "loss": 1.1846, "step": 40 },
    { "epoch": 0.08, "grad_norm": 0.12460777163505554, "learning_rate": 1.981240418528914e-05, "loss": 1.1954, "step": 41 },
    { "epoch": 0.08, "grad_norm": 0.1261477917432785, "learning_rate": 1.9800147143857774e-05, "loss": 1.1944, "step": 42 },
    { "epoch": 0.08, "grad_norm": 0.12070100754499435, "learning_rate": 1.9787506286059584e-05, "loss": 1.1814, "step": 43 },
    { "epoch": 0.09, "grad_norm": 0.1318473368883133, "learning_rate": 1.9774482106965512e-05, "loss": 1.2289, "step": 44 },
    { "epoch": 0.09, "grad_norm": 0.11869361251592636, "learning_rate": 1.9761075116659037e-05, "loss": 1.1507, "step": 45 },
    { "epoch": 0.09, "grad_norm": 0.11668427288532257, "learning_rate": 1.974728584021618e-05, "loss": 1.1693, "step": 46 },
    { "epoch": 0.09, "grad_norm": 0.12271205335855484, "learning_rate": 1.9733114817684957e-05, "loss": 1.219, "step": 47 },
    { "epoch": 0.09, "grad_norm": 0.12055838108062744, "learning_rate": 1.9718562604064213e-05, "loss": 1.2424, "step": 48 },
    { "epoch": 0.1, "grad_norm": 0.1191168949007988, "learning_rate": 1.97036297692819e-05, "loss": 1.2206, "step": 49 },
    { "epoch": 0.1, "grad_norm": 0.11361384391784668, "learning_rate": 1.9688316898172744e-05, "loss": 1.1927, "step": 50 },
    { "epoch": 0.1, "grad_norm": 0.109556645154953, "learning_rate": 1.967262459045535e-05, "loss": 1.2013, "step": 51 },
    { "epoch": 0.1, "grad_norm": 0.11278169602155685, "learning_rate": 1.9656553460708707e-05, "loss": 1.2379, "step": 52 },
    { "epoch": 0.1, "grad_norm": 0.11011548340320587, "learning_rate": 1.9640104138348124e-05, "loss": 1.1808, "step": 53 },
    { "epoch": 0.11, "grad_norm": 0.09818632155656815, "learning_rate": 1.9623277267600574e-05, "loss": 1.1731, "step": 54 },
    { "epoch": 0.11, "grad_norm": 0.1045491099357605, "learning_rate": 1.9606073507479466e-05, "loss": 1.1729, "step": 55 },
    { "epoch": 0.11, "grad_norm": 0.0985143780708313, "learning_rate": 1.9588493531758843e-05, "loss": 1.165, "step": 56 },
    { "epoch": 0.11, "grad_norm": 0.09513280540704727, "learning_rate": 1.9570538028946974e-05, "loss": 1.1765, "step": 57 },
    { "epoch": 0.11, "grad_norm": 0.09834066778421402, "learning_rate": 1.9552207702259412e-05, "loss": 1.1411, "step": 58 },
    { "epoch": 0.12, "grad_norm": 0.09748240560293198, "learning_rate": 1.9533503269591438e-05, "loss": 1.1995, "step": 59 },
    { "epoch": 0.12, "grad_norm": 0.09501401335000992, "learning_rate": 1.9514425463489946e-05, "loss": 1.1414, "step": 60 },
    { "epoch": 0.12, "grad_norm": 0.09078366309404373, "learning_rate": 1.9494975031124768e-05, "loss": 1.1132, "step": 61 },
    { "epoch": 0.12, "grad_norm": 0.09064218401908875, "learning_rate": 1.947515273425939e-05, "loss": 1.1498, "step": 62 },
    { "epoch": 0.12, "grad_norm": 0.09029112011194229, "learning_rate": 1.945495934922113e-05, "loss": 1.158, "step": 63 },
    { "epoch": 0.12, "grad_norm": 0.09335145354270935, "learning_rate": 1.9434395666870735e-05, "loss": 1.181, "step": 64 },
    { "epoch": 0.13, "grad_norm": 0.08959628641605377, "learning_rate": 1.9413462492571403e-05, "loss": 1.1353, "step": 65 },
    { "epoch": 0.13, "grad_norm": 0.09235028922557831, "learning_rate": 1.9392160646157242e-05, "loss": 1.1566, "step": 66 },
    { "epoch": 0.13, "grad_norm": 0.08852320164442062, "learning_rate": 1.937049096190117e-05, "loss": 1.1015, "step": 67 },
    { "epoch": 0.13, "grad_norm": 0.09060905128717422, "learning_rate": 1.934845428848222e-05, "loss": 1.1312, "step": 68 },
    { "epoch": 0.13, "grad_norm": 0.09065355360507965, "learning_rate": 1.9326051488952334e-05, "loss": 1.1456, "step": 69 },
    { "epoch": 0.14, "grad_norm": 0.09140690416097641, "learning_rate": 1.9303283440702524e-05, "loss": 1.1661, "step": 70 },
    { "epoch": 0.14, "grad_norm": 0.08641023188829422, "learning_rate": 1.9280151035428544e-05, "loss": 1.1153, "step": 71 },
    { "epoch": 0.14, "grad_norm": 0.08729224652051926, "learning_rate": 1.9256655179095954e-05, "loss": 1.1956, "step": 72 },
    { "epoch": 0.14, "grad_norm": 0.08514908701181412, "learning_rate": 1.9232796791904627e-05, "loss": 1.0969, "step": 73 },
    { "epoch": 0.14, "grad_norm": 0.08789129555225372, "learning_rate": 1.9208576808252725e-05, "loss": 1.1669, "step": 74 },
    { "epoch": 0.15, "grad_norm": 0.0829731896519661, "learning_rate": 1.918399617670011e-05, "loss": 1.101, "step": 75 },
    { "epoch": 0.15, "grad_norm": 0.08415351063013077, "learning_rate": 1.9159055859931163e-05, "loss": 1.122, "step": 76 },
    { "epoch": 0.15, "grad_norm": 0.07933653146028519, "learning_rate": 1.9133756834717118e-05, "loss": 1.1175, "step": 77 },
    { "epoch": 0.15, "grad_norm": 0.0849999189376831, "learning_rate": 1.9108100091877787e-05, "loss": 1.1577, "step": 78 },
    { "epoch": 0.15, "grad_norm": 0.0835108831524849, "learning_rate": 1.9082086636242757e-05, "loss": 1.1253, "step": 79 },
    { "epoch": 0.16, "grad_norm": 0.07834841310977936, "learning_rate": 1.905571748661204e-05, "loss": 1.0963, "step": 80 },
    { "epoch": 0.16, "grad_norm": 0.07953493297100067, "learning_rate": 1.902899367571617e-05, "loss": 1.1102, "step": 81 },
    { "epoch": 0.16, "grad_norm": 0.07989759743213654, "learning_rate": 1.9001916250175764e-05, "loss": 1.1576, "step": 82 },
    { "epoch": 0.16, "grad_norm": 0.07849448174238205, "learning_rate": 1.8974486270460518e-05, "loss": 1.0963, "step": 83 },
    { "epoch": 0.16, "grad_norm": 0.07805287837982178, "learning_rate": 1.894670481084769e-05, "loss": 1.1364, "step": 84 },
    { "epoch": 0.17, "grad_norm": 0.07698098570108414, "learning_rate": 1.8918572959380005e-05, "loss": 1.1407, "step": 85 },
    { "epoch": 0.17, "grad_norm": 0.0766262486577034, "learning_rate": 1.8890091817823073e-05, "loss": 1.1225, "step": 86 },
    { "epoch": 0.17, "grad_norm": 0.0798678770661354, "learning_rate": 1.8861262501622213e-05, "loss": 1.137, "step": 87 },
    { "epoch": 0.17, "grad_norm": 0.07717825472354889, "learning_rate": 1.8832086139858777e-05, "loss": 1.1311, "step": 88 },
    { "epoch": 0.17, "grad_norm": 0.07542562484741211, "learning_rate": 1.880256387520593e-05, "loss": 1.1066, "step": 89 },
    { "epoch": 0.18, "grad_norm": 0.07316063344478607, "learning_rate": 1.8772696863883905e-05, "loss": 1.0976, "step": 90 },
    { "epoch": 0.18, "grad_norm": 0.0738874301314354, "learning_rate": 1.8742486275614706e-05, "loss": 1.0901, "step": 91 },
    { "epoch": 0.18, "grad_norm": 0.07698226720094681, "learning_rate": 1.8711933293576303e-05, "loss": 1.1224, "step": 92 },
    { "epoch": 0.18, "grad_norm": 0.07452582567930222, "learning_rate": 1.8681039114356298e-05, "loss": 1.1399, "step": 93 },
    { "epoch": 0.18, "grad_norm": 0.07452700287103653, "learning_rate": 1.8649804947905057e-05, "loss": 1.1639, "step": 94 },
    { "epoch": 0.19, "grad_norm": 0.07358838617801666, "learning_rate": 1.861823201748833e-05, "loss": 1.1139, "step": 95 },
    { "epoch": 0.19, "grad_norm": 0.07469804584980011, "learning_rate": 1.8586321559639316e-05, "loss": 1.1103, "step": 96 },
    { "epoch": 0.19, "grad_norm": 0.07484911382198334, "learning_rate": 1.8554074824110285e-05, "loss": 1.1231, "step": 97 },
    { "epoch": 0.19, "grad_norm": 0.07320189476013184, "learning_rate": 1.8521493073823583e-05, "loss": 1.1405, "step": 98 },
    { "epoch": 0.19, "grad_norm": 0.07219311594963074, "learning_rate": 1.8488577584822197e-05, "loss": 1.1084, "step": 99 },
    { "epoch": 0.19, "grad_norm": 0.07267658412456512, "learning_rate": 1.8455329646219767e-05, "loss": 1.109, "step": 100 },
    { "epoch": 0.2, "grad_norm": 0.07124843448400497, "learning_rate": 1.8421750560150112e-05, "loss": 1.0997, "step": 101 },
    { "epoch": 0.2, "grad_norm": 0.06921572983264923, "learning_rate": 1.8387841641716226e-05, "loss": 1.1095, "step": 102 },
    { "epoch": 0.2, "grad_norm": 0.07149618864059448, "learning_rate": 1.835360421893876e-05, "loss": 1.1078, "step": 103 },
    { "epoch": 0.2, "grad_norm": 0.07851895689964294, "learning_rate": 1.8319039632704042e-05, "loss": 1.1195, "step": 104 },
    { "epoch": 0.2, "grad_norm": 0.07615454494953156, "learning_rate": 1.8284149236711527e-05, "loss": 1.0754, "step": 105 },
    { "epoch": 0.21, "grad_norm": 0.07054944336414337, "learning_rate": 1.8248934397420802e-05, "loss": 1.0943, "step": 106 },
    { "epoch": 0.21, "grad_norm": 0.07253159582614899, "learning_rate": 1.821339649399807e-05, "loss": 1.1263, "step": 107 },
    { "epoch": 0.21, "grad_norm": 0.0729857012629509, "learning_rate": 1.817753691826212e-05, "loss": 1.0977, "step": 108 },
    { "epoch": 0.21, "grad_norm": 0.07234011590480804, "learning_rate": 1.8141357074629838e-05, "loss": 1.1334, "step": 109 },
    { "epoch": 0.21, "grad_norm": 0.07030120491981506, "learning_rate": 1.8104858380061178e-05, "loss": 1.0767, "step": 110 },
    { "epoch": 0.22, "grad_norm": 0.07036615908145905, "learning_rate": 1.80680422640037e-05, "loss": 1.0796, "step": 111 },
    { "epoch": 0.22, "grad_norm": 0.0742933601140976, "learning_rate": 1.8030910168336558e-05, "loss": 1.0671, "step": 112 },
    { "epoch": 0.22, "grad_norm": 0.07065165787935257, "learning_rate": 1.7993463547314044e-05, "loss": 1.1594, "step": 113 },
    { "epoch": 0.22, "grad_norm": 0.07182008028030396, "learning_rate": 1.7955703867508634e-05, "loss": 1.0936, "step": 114 },
    { "epoch": 0.22, "grad_norm": 0.06882106512784958, "learning_rate": 1.791763260775354e-05, "loss": 1.1017, "step": 115 },
    { "epoch": 0.23, "grad_norm": 0.07001936435699463, "learning_rate": 1.7879251259084803e-05, "loss": 1.1267, "step": 116 },
    { "epoch": 0.23, "grad_norm": 0.06916490197181702, "learning_rate": 1.78405613246829e-05, "loss": 1.0787, "step": 117 },
    { "epoch": 0.23, "grad_norm": 0.07149837166070938, "learning_rate": 1.7801564319813854e-05, "loss": 1.1302, "step": 118 },
    { "epoch": 0.23, "grad_norm": 0.06783504039049149, "learning_rate": 1.776226177176991e-05, "loss": 1.1159, "step": 119 },
    { "epoch": 0.23, "grad_norm": 0.07285293936729431, "learning_rate": 1.7722655219809718e-05, "loss": 1.0758, "step": 120 },
    { "epoch": 0.24, "grad_norm": 0.07273004204034805, "learning_rate": 1.768274621509803e-05, "loss": 1.1019, "step": 121 },
    { "epoch": 0.24, "grad_norm": 0.07392899692058563, "learning_rate": 1.7642536320644964e-05, "loss": 1.1111, "step": 122 },
    { "epoch": 0.24, "grad_norm": 0.0693732351064682, "learning_rate": 1.7602027111244807e-05, "loss": 1.1109, "step": 123 },
    { "epoch": 0.24, "grad_norm": 0.0721542090177536, "learning_rate": 1.7561220173414297e-05, "loss": 1.1246, "step": 124 },
    { "epoch": 0.24, "grad_norm": 0.07002190500497818, "learning_rate": 1.7520117105330524e-05, "loss": 1.073, "step": 125 },
    { "epoch": 0.25, "grad_norm": 0.0697953850030899, "learning_rate": 1.7478719516768324e-05, "loss": 1.0913, "step": 126 },
    { "epoch": 0.25, "grad_norm": 0.07040461152791977, "learning_rate": 1.7437029029037233e-05, "loss": 1.1445, "step": 127 },
    { "epoch": 0.25, "grad_norm": 0.07231634110212326, "learning_rate": 1.7395047274917994e-05, "loss": 1.1106, "step": 128 },
    { "epoch": 0.25, "eval_loss": 1.0988876819610596, "eval_runtime": 708.4228, "eval_samples_per_second": 7.162, "eval_steps_per_second": 1.791, "step": 128 },
    { "epoch": 0.25, "grad_norm": 0.0713375061750412, "learning_rate": 1.7352775898598615e-05, "loss": 1.0982, "step": 129 },
    { "epoch": 0.25, "grad_norm": 0.06747942417860031, "learning_rate": 1.731021655560995e-05, "loss": 1.1017, "step": 130 },
    { "epoch": 0.26, "grad_norm": 0.071540467441082, "learning_rate": 1.72673709127609e-05, "loss": 1.0859, "step": 131 },
    { "epoch": 0.26, "grad_norm": 0.06861750036478043, "learning_rate": 1.7224240648073097e-05, "loss": 1.0728, "step": 132 },
    { "epoch": 0.26, "grad_norm": 0.06919445842504501, "learning_rate": 1.718082745071521e-05, "loss": 1.1218, "step": 133 },
    { "epoch": 0.26, "grad_norm": 0.07422851771116257, "learning_rate": 1.7137133020936783e-05, "loss": 1.0881, "step": 134 },
    { "epoch": 0.26, "grad_norm": 0.07452652603387833, "learning_rate": 1.7093159070001637e-05, "loss": 1.1073, "step": 135 },
    { "epoch": 0.27, "grad_norm": 0.07337850332260132, "learning_rate": 1.7048907320120867e-05, "loss": 1.1065, "step": 136 },
    { "epoch": 0.27, "grad_norm": 0.07020066678524017, "learning_rate": 1.700437950438537e-05, "loss": 1.0742, "step": 137 },
    { "epoch": 0.27, "grad_norm": 0.07053718715906143, "learning_rate": 1.695957736669799e-05, "loss": 1.0627, "step": 138 },
    { "epoch": 0.27, "grad_norm": 0.07288292795419693, "learning_rate": 1.6914502661705216e-05, "loss": 1.0842, "step": 139 },
    { "epoch": 0.27, "grad_norm": 0.07197044044733047, "learning_rate": 1.6869157154728437e-05, "loss": 1.065, "step": 140 },
    { "epoch": 0.27, "grad_norm": 0.07109569013118744, "learning_rate": 1.6823542621694852e-05, "loss": 1.0996, "step": 141 },
    { "epoch": 0.28, "grad_norm": 0.07084467262029648, "learning_rate": 1.677766084906787e-05, "loss": 1.0862, "step": 142 },
    { "epoch": 0.28, "grad_norm": 0.07195379585027695, "learning_rate": 1.6731513633777173e-05, "loss": 1.1184, "step": 143 },
    { "epoch": 0.28, "grad_norm": 0.07326792180538177, "learning_rate": 1.668510278314833e-05, "loss": 1.0867, "step": 144 },
    { "epoch": 0.28, "grad_norm": 0.07582233846187592, "learning_rate": 1.6638430114832015e-05, "loss": 1.0721, "step": 145 },
    { "epoch": 0.28, "grad_norm": 0.07204006612300873, "learning_rate": 1.6591497456732827e-05, "loss": 1.0565, "step": 146 },
    { "epoch": 0.29, "grad_norm": 0.07225130498409271, "learning_rate": 1.6544306646937683e-05, "loss": 1.1036, "step": 147 },
    { "epoch": 0.29, "grad_norm": 0.07662148773670197, "learning_rate": 1.649685953364385e-05, "loss": 1.0289, "step": 148 },
    { "epoch": 0.29, "grad_norm": 0.07611638307571411, "learning_rate": 1.644915797508656e-05, "loss": 1.1068, "step": 149 },
    { "epoch": 0.29, "grad_norm": 0.07609565556049347, "learning_rate": 1.6401203839466212e-05, "loss": 1.0816, "step": 150 },
    { "epoch": 0.29, "grad_norm": 0.0737641304731369, "learning_rate": 1.6352999004875242e-05, "loss": 1.1016, "step": 151 },
    { "epoch": 0.3, "grad_norm": 0.07359515875577927, "learning_rate": 1.630454535922452e-05, "loss": 1.0787, "step": 152 },
    { "epoch": 0.3, "grad_norm": 0.07506351917982101, "learning_rate": 1.6255844800169472e-05, "loss": 1.0789, "step": 153 },
    { "epoch": 0.3, "grad_norm": 0.07777760922908783, "learning_rate": 1.62068992350357e-05, "loss": 1.096, "step": 154 },
    { "epoch": 0.3, "grad_norm": 0.07574637979269028, "learning_rate": 1.6157710580744322e-05, "loss": 1.1007, "step": 155 },
    { "epoch": 0.3, "grad_norm": 0.07857154309749603, "learning_rate": 1.610828076373687e-05, "loss": 1.0735, "step": 156 },
    { "epoch": 0.31, "grad_norm": 0.07402702420949936, "learning_rate": 1.605861171989988e-05, "loss": 1.1003, "step": 157 },
    { "epoch": 0.31, "grad_norm": 0.07439373433589935, "learning_rate": 1.6008705394489032e-05, "loss": 1.0662, "step": 158 },
    { "epoch": 0.31, "grad_norm": 0.07392847537994385, "learning_rate": 1.5958563742052987e-05, "loss": 1.0487, "step": 159 },
    { "epoch": 0.31, "grad_norm": 0.07773245126008987, "learning_rate": 1.5908188726356843e-05, "loss": 1.1107, "step": 160 },
    { "epoch": 0.31, "grad_norm": 0.07752656936645508, "learning_rate": 1.5857582320305207e-05, "loss": 1.0426, "step": 161 },
    { "epoch": 0.32, "grad_norm": 0.07541097700595856, "learning_rate": 1.5806746505864947e-05, "loss": 1.081, "step": 162 },
    { "epoch": 0.32, "grad_norm": 0.07938623428344727, "learning_rate": 1.5755683273987554e-05, "loss": 1.0969, "step": 163 },
    { "epoch": 0.32, "grad_norm": 0.07379717379808426, "learning_rate": 1.5704394624531184e-05, "loss": 1.0763, "step": 164 },
    { "epoch": 0.32, "grad_norm": 0.07850446552038193, "learning_rate": 1.5652882566182316e-05, "loss": 1.1029, "step": 165 },
    { "epoch": 0.32, "grad_norm": 0.07627106457948685, "learning_rate": 1.5601149116377095e-05, "loss": 1.0611, "step": 166 },
    { "epoch": 0.33, "grad_norm": 0.07577154785394669, "learning_rate": 1.554919630122232e-05, "loss": 1.0973, "step": 167 },
    { "epoch": 0.33, "grad_norm": 0.07844171673059464, "learning_rate": 1.5497026155416087e-05, "loss": 1.1006, "step": 168 },
    { "epoch": 0.33, "grad_norm": 0.08061926811933517, "learning_rate": 1.5444640722168114e-05, "loss": 1.0879, "step": 169 },
    { "epoch": 0.33, "grad_norm": 0.07918211817741394, "learning_rate": 1.53920420531197e-05, "loss": 1.0602, "step": 170 },
    { "epoch": 0.33, "grad_norm": 0.08213488012552261, "learning_rate": 1.5339232208263394e-05, "loss": 1.0798, "step": 171 },
    { "epoch": 0.34, "grad_norm": 0.07898285239934921, "learning_rate": 1.5286213255862295e-05, "loss": 1.0969, "step": 172 },
    { "epoch": 0.34, "grad_norm": 0.08233582973480225, "learning_rate": 1.5232987272369076e-05, "loss": 1.0699, "step": 173 },
    { "epoch": 0.34, "grad_norm": 0.08074311912059784, "learning_rate": 1.5179556342344643e-05, "loss": 1.0851, "step": 174 },
    { "epoch": 0.34, "grad_norm": 0.08196305483579636, "learning_rate": 1.51259225583765e-05, "loss": 1.076, "step": 175 },
    { "epoch": 0.34, "grad_norm": 0.08637065440416336, "learning_rate": 1.5072088020996791e-05, "loss": 1.0989, "step": 176 },
    { "epoch": 0.35, "grad_norm": 0.08313170820474625, "learning_rate": 1.5018054838600033e-05, "loss": 1.09, "step": 177 },
    { "epoch": 0.35, "grad_norm": 0.08245568722486496, "learning_rate": 1.496382512736056e-05, "loss": 1.0572, "step": 178 },
    { "epoch": 0.35, "grad_norm": 0.08442118763923645, "learning_rate": 1.490940101114961e-05, "loss": 1.0669, "step": 179 },
    { "epoch": 0.35, "grad_norm": 0.08224523812532425, "learning_rate": 1.4854784621452176e-05, "loss": 1.0842, "step": 180 },
    { "epoch": 0.35, "grad_norm": 0.08642537891864777, "learning_rate": 1.479997809728352e-05, "loss": 1.123, "step": 181 },
    { "epoch": 0.35, "grad_norm": 0.08723440766334534, "learning_rate": 1.4744983585105388e-05, "loss": 1.0649, "step": 182 },
    { "epoch": 0.36, "grad_norm": 0.08666212856769562, "learning_rate": 1.4689803238741955e-05, "loss": 1.0938, "step": 183 },
    { "epoch": 0.36, "grad_norm": 0.09213647246360779, "learning_rate": 1.463443921929548e-05, "loss": 1.0903, "step": 184 },
    { "epoch": 0.36, "grad_norm": 0.08998877555131912, "learning_rate": 1.4578893695061644e-05, "loss": 1.0778, "step": 185 },
    { "epoch": 0.36, "grad_norm": 0.09158129245042801, "learning_rate": 1.4523168841444657e-05, "loss": 1.0932, "step": 186 },
    { "epoch": 0.36, "grad_norm": 0.09460633993148804, "learning_rate": 1.4467266840872041e-05, "loss": 1.0691, "step": 187 },
    { "epoch": 0.37, "grad_norm": 0.09502755105495453, "learning_rate": 1.441118988270916e-05, "loss": 1.0684, "step": 188 },
    { "epoch": 0.37, "grad_norm": 0.09307122975587845, "learning_rate": 1.4354940163173486e-05, "loss": 1.0776, "step": 189 },
    { "epoch": 0.37, "grad_norm": 0.09580650180578232, "learning_rate": 1.4298519885248574e-05, "loss": 1.0882, "step": 190 },
    { "epoch": 0.37, "grad_norm": 0.09251459687948227, "learning_rate": 1.4241931258597781e-05, "loss": 1.077, "step": 191 },
    { "epoch": 0.37, "grad_norm": 0.09432998299598694, "learning_rate": 1.4185176499477742e-05, "loss": 1.0012, "step": 192 },
    { "epoch": 0.38, "grad_norm": 0.09586652368307114, "learning_rate": 1.4128257830651554e-05, "loss": 1.0334, "step": 193 },
    { "epoch": 0.38, "grad_norm": 0.09538242220878601, "learning_rate": 1.407117748130174e-05, "loss": 1.0731, "step": 194 },
    { "epoch": 0.38, "grad_norm": 0.09691152721643448, "learning_rate": 1.401393768694292e-05, "loss": 1.0412, "step": 195 },
    { "epoch": 0.38, "grad_norm": 0.09779084473848343, "learning_rate": 1.3956540689334286e-05, "loss": 1.0602, "step": 196 },
    { "epoch": 0.38, "grad_norm": 0.0998532623052597, "learning_rate": 1.3898988736391792e-05, "loss": 1.0261, "step": 197 },
    { "epoch": 0.39, "grad_norm": 0.10739872604608536, "learning_rate": 1.384128408210011e-05, "loss": 1.0502, "step": 198 },
    { "epoch": 0.39, "grad_norm": 0.11806387454271317, "learning_rate": 1.3783428986424366e-05, "loss": 1.1188, "step": 199 },
    { "epoch": 0.39, "grad_norm": 0.10208501666784286, "learning_rate": 1.3725425715221625e-05, "loss": 1.0465, "step": 200 },
    { "epoch": 0.39, "grad_norm": 0.1044783741235733, "learning_rate": 1.3667276540152143e-05, "loss": 1.0561, "step": 201 },
    { "epoch": 0.39, "grad_norm": 0.1070132926106453, "learning_rate": 1.3608983738590414e-05, "loss": 1.0429, "step": 202 },
    { "epoch": 0.4, "grad_norm": 0.11181865632534027, "learning_rate": 1.3550549593535965e-05, "loss": 1.0564, "step": 203 },
    { "epoch": 0.4, "grad_norm": 0.11098324507474899, "learning_rate": 1.3491976393523952e-05, "loss": 1.0632, "step": 204 },
    { "epoch": 0.4, "grad_norm": 0.10281454026699066, "learning_rate": 1.343326643253552e-05, "loss": 1.0637, "step": 205 },
    { "epoch": 0.4, "grad_norm": 0.10408665239810944, "learning_rate": 1.3374422009907984e-05, "loss": 1.0701, "step": 206 },
    { "epoch": 0.4, "grad_norm": 0.10533872246742249, "learning_rate": 1.3315445430244744e-05, "loss": 1.0654, "step": 207 },
    { "epoch": 0.41, "grad_norm": 0.10545054078102112, "learning_rate": 1.3256339003325054e-05, "loss": 1.0518, "step": 208 },
    { "epoch": 0.41, "grad_norm": 0.09894714504480362, "learning_rate": 1.3197105044013544e-05, "loss": 1.0671, "step": 209 },
    { "epoch": 0.41, "grad_norm": 0.08720172196626663, "learning_rate": 1.3137745872169578e-05, "loss": 1.0127, "step": 210 },
    { "epoch": 0.41, "grad_norm": 0.08827454596757889, "learning_rate": 1.3078263812556377e-05, "loss": 1.0154, "step": 211 },
    { "epoch": 0.41, "grad_norm": 0.0914626345038414, "learning_rate": 1.3018661194749986e-05, "loss": 1.0201, "step": 212 },
    { "epoch": 0.42, "grad_norm": 0.08843535929918289, "learning_rate": 1.295894035304803e-05, "loss": 1.0516, "step": 213 },
    { "epoch": 0.42, "grad_norm": 0.08639541268348694, "learning_rate": 1.28991036263783e-05, "loss": 1.0165, "step": 214 },
    { "epoch": 0.42, "grad_norm": 0.07750130444765091, "learning_rate": 1.2839153358207142e-05, "loss": 1.0223, "step": 215 },
    { "epoch": 0.42, "grad_norm": 0.0824190005660057, "learning_rate": 1.2779091896447682e-05, "loss": 1.0337, "step": 216 },
    { "epoch": 0.42, "grad_norm": 0.08451572805643082, "learning_rate": 1.2718921593367874e-05, "loss": 1.0542, "step": 217 },
    { "epoch": 0.43, "grad_norm": 0.0857366994023323, "learning_rate": 1.2658644805498361e-05, "loss": 1.0759, "step": 218 },
    { "epoch": 0.43, "grad_norm": 0.07681415975093842, "learning_rate": 1.2598263893540207e-05, "loss": 1.0506, "step": 219 },
    { "epoch": 0.43, "grad_norm": 0.07856535911560059, "learning_rate": 1.2537781222272423e-05, "loss": 1.0974, "step": 220 },
    { "epoch": 0.43, "grad_norm": 0.08015410602092743, "learning_rate": 1.2477199160459345e-05, "loss": 1.0604, "step": 221 },
    { "epoch": 0.43, "grad_norm": 0.08314133435487747, "learning_rate": 1.2416520080757892e-05, "loss": 1.0889, "step": 222 },
    { "epoch": 0.43, "grad_norm": 0.08028203994035721, "learning_rate": 1.2355746359624621e-05, "loss": 1.0281, "step": 223 },
    { "epoch": 0.44, "grad_norm": 0.0775797963142395, "learning_rate": 1.2294880377222649e-05, "loss": 1.078, "step": 224 },
    { "epoch": 0.44, "grad_norm": 0.08315123617649078, "learning_rate": 1.2233924517328456e-05, "loss": 1.0356, "step": 225 },
    { "epoch": 0.44, "grad_norm": 0.0795183852314949, "learning_rate": 1.2172881167238515e-05, "loss": 1.0332, "step": 226 },
    { "epoch": 0.44, "grad_norm": 0.0779062882065773, "learning_rate": 1.2111752717675788e-05, "loss": 0.9954, "step": 227 },
    { "epoch": 0.44, "grad_norm": 0.07758854329586029, "learning_rate": 1.205054156269611e-05, "loss": 1.0242, "step": 228 },
    { "epoch": 0.45, "grad_norm": 0.07713694125413895, "learning_rate": 1.1989250099594412e-05, "loss": 1.0686, "step": 229 },
    { "epoch": 0.45, "grad_norm": 0.07772821933031082, "learning_rate": 1.192788072881085e-05, "loss": 1.0338, "step": 230 },
    { "epoch": 0.45, "grad_norm": 0.08006665855646133, "learning_rate": 1.1866435853836773e-05, "loss": 1.0946, "step": 231 },
    { "epoch": 0.45, "grad_norm": 0.0821637436747551, "learning_rate": 1.1804917881120608e-05, "loss": 1.0525, "step": 232 },
    { "epoch": 0.45, "grad_norm": 0.07892850786447525, "learning_rate": 1.1743329219973609e-05, "loss": 1.0127, "step": 233 },
    { "epoch": 0.46, "grad_norm": 0.07800798863172531, "learning_rate": 1.1681672282475495e-05, "loss": 1.0254, "step": 234 },
    { "epoch": 0.46, "grad_norm": 0.07875402271747589, "learning_rate": 1.161994948337998e-05, "loss": 1.0319, "step": 235 },
    { "epoch": 0.46, "grad_norm": 0.08178096264600754, "learning_rate": 1.1558163240020209e-05, "loss": 1.0541, "step": 236 },
    { "epoch": 0.46, "grad_norm": 0.08126726001501083, "learning_rate": 1.1496315972214076e-05, "loss": 1.0681, "step": 237 },
    { "epoch": 0.46, "grad_norm": 0.08104463666677475, "learning_rate": 1.1434410102169462e-05, "loss": 0.9767, "step": 238 },
    { "epoch": 0.47, "grad_norm": 0.0746295303106308, "learning_rate": 1.1372448054389364e-05, "loss": 1.0586, "step": 239 },
    { "epoch": 0.47, "grad_norm": 0.08171354979276657, "learning_rate": 1.1310432255576944e-05, "loss": 1.0655, "step": 240 },
    { "epoch": 0.47, "grad_norm": 0.08069796115159988, "learning_rate": 1.1248365134540489e-05, "loss": 1.079, "step": 241 },
    { "epoch": 0.47, "grad_norm": 0.07922904193401337, "learning_rate": 1.1186249122098282e-05, "loss": 1.0371, "step": 242 },
    { "epoch": 0.47, "grad_norm": 0.07877922058105469, "learning_rate": 1.1124086650983415e-05, "loss": 1.0236, "step": 243 },
    { "epoch": 0.48, "grad_norm": 0.07606945931911469, "learning_rate": 1.1061880155748497e-05, "loss": 1.0255, "step": 244 },
    { "epoch": 0.48, "grad_norm": 0.08225277811288834, "learning_rate": 1.0999632072670314e-05, "loss": 1.0571, "step": 245 },
    { "epoch": 0.48, "grad_norm": 0.07907744497060776, "learning_rate": 1.0937344839654416e-05, "loss": 1.0745, "step": 246 },
    { "epoch": 0.48, "grad_norm": 0.07885382324457169, "learning_rate": 1.087502089613963e-05, "loss": 0.9899, "step": 247 },
    { "epoch": 0.48, "grad_norm": 0.08236192911863327, "learning_rate": 1.0812662683002528e-05, "loss": 1.046, "step": 248 },
    { "epoch": 0.49, "grad_norm": 0.08153583109378815, "learning_rate": 1.075027264246183e-05, "loss": 1.0769, "step": 249 },
    { "epoch": 0.49, "grad_norm": 0.0847182348370552, "learning_rate": 1.068785321798276e-05, "loss": 1.0695, "step": 250 },
    { "epoch": 0.49, "grad_norm": 0.07414229959249496, "learning_rate": 1.062540685418133e-05, "loss": 1.0555, "step": 251 },
    { "epoch": 0.49, "grad_norm": 0.07932449132204056, "learning_rate": 1.0562935996728629e-05, "loss": 1.0644, "step": 252 },
    { "epoch": 0.49, "grad_norm": 0.08247576653957367, "learning_rate": 1.0500443092255017e-05, "loss": 1.064, "step": 253 },
    { "epoch": 0.5, "grad_norm": 0.07860003411769867, "learning_rate": 1.043793058825431e-05, "loss": 1.0579, "step": 254 },
    { "epoch": 0.5, "grad_norm": 0.08330255001783371, "learning_rate": 1.0375400932987932e-05, "loss": 1.0218, "step": 255 },
    { "epoch": 0.5, "grad_norm": 0.08150562644004822, "learning_rate": 1.0312856575389016e-05, "loss": 1.0379, "step": 256 },
    { "epoch": 0.5, "eval_loss": 1.0509783029556274, "eval_runtime": 708.357, "eval_samples_per_second": 7.163, "eval_steps_per_second": 1.791, "step": 256 }
  ],
  "logging_steps": 1,
  "max_steps": 512,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 256,
  "total_flos": 2.262770368118784e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}