{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.008, "grad_norm": 34.44804382324219, "learning_rate": 1e-05, "loss": 13.0101, "mean_token_accuracy": 0.4696590006351471, "step": 1},
    {"epoch": 0.016, "grad_norm": 30.779788970947266, "learning_rate": 2e-05, "loss": 12.3851, "mean_token_accuracy": 0.47303473204374313, "step": 2},
    {"epoch": 0.024, "grad_norm": 29.67559242248535, "learning_rate": 3e-05, "loss": 12.3488, "mean_token_accuracy": 0.49709559231996536, "step": 3},
    {"epoch": 0.032, "grad_norm": 26.862010955810547, "learning_rate": 4e-05, "loss": 11.6596, "mean_token_accuracy": 0.5584611147642136, "step": 4},
    {"epoch": 0.04, "grad_norm": 22.10072135925293, "learning_rate": 5e-05, "loss": 10.1384, "mean_token_accuracy": 0.5924926251173019, "step": 5},
    {"epoch": 0.048, "grad_norm": 20.171361923217773, "learning_rate": 4.98989898989899e-05, "loss": 9.5421, "mean_token_accuracy": 0.5888276249170303, "step": 6},
    {"epoch": 0.056, "grad_norm": 16.452842712402344, "learning_rate": 4.97979797979798e-05, "loss": 8.4344, "mean_token_accuracy": 0.632336363196373, "step": 7},
    {"epoch": 0.064, "grad_norm": 12.62137222290039, "learning_rate": 4.9696969696969694e-05, "loss": 7.7758, "mean_token_accuracy": 0.6633019298315048, "step": 8},
    {"epoch": 0.072, "grad_norm": 11.533007621765137, "learning_rate": 4.9595959595959594e-05, "loss": 7.5523, "mean_token_accuracy": 0.6721473336219788, "step": 9},
    {"epoch": 0.08, "grad_norm": 9.251978874206543, "learning_rate": 4.94949494949495e-05, "loss": 7.5504, "mean_token_accuracy": 0.6924590468406677, "step": 10},
    {"epoch": 0.088, "grad_norm": 9.099457740783691, "learning_rate": 4.93939393939394e-05, "loss": 7.1883, "mean_token_accuracy": 0.694612979888916, "step": 11},
    {"epoch": 0.096, "grad_norm": 9.096614837646484, "learning_rate": 4.92929292929293e-05, "loss": 6.7714, "mean_token_accuracy": 0.7158520817756653, "step": 12},
    {"epoch": 0.104, "grad_norm": 8.9071044921875, "learning_rate": 4.919191919191919e-05, "loss": 6.7582, "mean_token_accuracy": 0.7046481072902679, "step": 13},
    {"epoch": 0.112, "grad_norm": 8.336892127990723, "learning_rate": 4.909090909090909e-05, "loss": 6.639, "mean_token_accuracy": 0.6894638538360596, "step": 14},
    {"epoch": 0.12, "grad_norm": 7.760809421539307, "learning_rate": 4.898989898989899e-05, "loss": 6.1283, "mean_token_accuracy": 0.7323453426361084, "step": 15},
    {"epoch": 0.128, "grad_norm": 10.2608003616333, "learning_rate": 4.888888888888889e-05, "loss": 6.3731, "mean_token_accuracy": 0.7066609710454941, "step": 16},
    {"epoch": 0.136, "grad_norm": 9.326302528381348, "learning_rate": 4.878787878787879e-05, "loss": 6.0298, "mean_token_accuracy": 0.7226346284151077, "step": 17},
    {"epoch": 0.144, "grad_norm": 9.35575008392334, "learning_rate": 4.868686868686869e-05, "loss": 6.1363, "mean_token_accuracy": 0.7228655517101288, "step": 18},
    {"epoch": 0.152, "grad_norm": 7.2700018882751465, "learning_rate": 4.858585858585859e-05, "loss": 6.3529, "mean_token_accuracy": 0.7191445082426071, "step": 19},
    {"epoch": 0.16, "grad_norm": 7.965806484222412, "learning_rate": 4.848484848484849e-05, "loss": 6.1908, "mean_token_accuracy": 0.7225745767354965, "step": 20},
    {"epoch": 0.168, "grad_norm": 9.064255714416504, "learning_rate": 4.838383838383839e-05, "loss": 6.0033, "mean_token_accuracy": 0.7359140962362289, "step": 21},
    {"epoch": 0.176, "grad_norm": 6.869060039520264, "learning_rate": 4.828282828282829e-05, "loss": 5.1424, "mean_token_accuracy": 0.7650310397148132, "step": 22},
    {"epoch": 0.184, "grad_norm": 7.401363372802734, "learning_rate": 4.8181818181818186e-05, "loss": 6.2128, "mean_token_accuracy": 0.7163093239068985, "step": 23},
    {"epoch": 0.192, "grad_norm": 7.535747051239014, "learning_rate": 4.808080808080808e-05, "loss": 5.561, "mean_token_accuracy": 0.7451221346855164, "step": 24},
    {"epoch": 0.2, "grad_norm": 8.211135864257812, "learning_rate": 4.797979797979798e-05, "loss": 5.265, "mean_token_accuracy": 0.7609298378229141, "step": 25},
    {"epoch": 0.208, "grad_norm": 6.720511436462402, "learning_rate": 4.787878787878788e-05, "loss": 5.3244, "mean_token_accuracy": 0.7525999993085861, "step": 26},
    {"epoch": 0.216, "grad_norm": 9.114042282104492, "learning_rate": 4.7777777777777784e-05, "loss": 5.6173, "mean_token_accuracy": 0.7520534992218018, "step": 27},
    {"epoch": 0.224, "grad_norm": 7.770754337310791, "learning_rate": 4.7676767676767684e-05, "loss": 5.5093, "mean_token_accuracy": 0.7558625787496567, "step": 28},
    {"epoch": 0.232, "grad_norm": 7.749612808227539, "learning_rate": 4.7575757575757576e-05, "loss": 5.5386, "mean_token_accuracy": 0.7553819268941879, "step": 29},
    {"epoch": 0.24, "grad_norm": 7.486739635467529, "learning_rate": 4.7474747474747476e-05, "loss": 5.73, "mean_token_accuracy": 0.732807844877243, "step": 30},
    {"epoch": 0.248, "grad_norm": 7.474062919616699, "learning_rate": 4.7373737373737375e-05, "loss": 6.0216, "mean_token_accuracy": 0.742279127240181, "step": 31},
    {"epoch": 0.256, "grad_norm": 7.018927574157715, "learning_rate": 4.7272727272727275e-05, "loss": 5.0907, "mean_token_accuracy": 0.7746435850858688, "step": 32},
    {"epoch": 0.264, "grad_norm": 6.719446659088135, "learning_rate": 4.7171717171717174e-05, "loss": 4.9967, "mean_token_accuracy": 0.7717019468545914, "step": 33},
    {"epoch": 0.272, "grad_norm": 6.464803695678711, "learning_rate": 4.7070707070707074e-05, "loss": 5.2966, "mean_token_accuracy": 0.7656355202198029, "step": 34},
    {"epoch": 0.28, "grad_norm": 7.2371320724487305, "learning_rate": 4.696969696969697e-05, "loss": 5.6333, "mean_token_accuracy": 0.7543937265872955, "step": 35},
    {"epoch": 0.288, "grad_norm": 6.18785285949707, "learning_rate": 4.686868686868687e-05, "loss": 5.3846, "mean_token_accuracy": 0.7626298367977142, "step": 36},
    {"epoch": 0.296, "grad_norm": 5.636417865753174, "learning_rate": 4.676767676767677e-05, "loss": 4.7779, "mean_token_accuracy": 0.796937882900238, "step": 37},
    {"epoch": 0.304, "grad_norm": 6.951849460601807, "learning_rate": 4.666666666666667e-05, "loss": 5.3062, "mean_token_accuracy": 0.7514433115720749, "step": 38},
    {"epoch": 0.312, "grad_norm": 7.596467018127441, "learning_rate": 4.656565656565657e-05, "loss": 5.4916, "mean_token_accuracy": 0.738706648349762, "step": 39},
    {"epoch": 0.32, "grad_norm": 7.036402225494385, "learning_rate": 4.6464646464646464e-05, "loss": 5.4346, "mean_token_accuracy": 0.7636198997497559, "step": 40},
    {"epoch": 0.328, "grad_norm": 7.065268039703369, "learning_rate": 4.636363636363636e-05, "loss": 4.9636, "mean_token_accuracy": 0.76950503885746, "step": 41},
    {"epoch": 0.336, "grad_norm": 5.93984842300415, "learning_rate": 4.626262626262626e-05, "loss": 4.9883, "mean_token_accuracy": 0.7728704810142517, "step": 42},
    {"epoch": 0.344, "grad_norm": 6.929603576660156, "learning_rate": 4.616161616161616e-05, "loss": 4.6927, "mean_token_accuracy": 0.7756397873163223, "step": 43},
    {"epoch": 0.352, "grad_norm": 7.2813496589660645, "learning_rate": 4.606060606060607e-05, "loss": 4.761, "mean_token_accuracy": 0.784713864326477, "step": 44},
    {"epoch": 0.36, "grad_norm": 6.584592342376709, "learning_rate": 4.595959595959596e-05, "loss": 4.7331, "mean_token_accuracy": 0.787657305598259, "step": 45},
    {"epoch": 0.368, "grad_norm": 7.5701751708984375, "learning_rate": 4.585858585858586e-05, "loss": 4.1914, "mean_token_accuracy": 0.796615332365036, "step": 46},
    {"epoch": 0.376, "grad_norm": 6.217738628387451, "learning_rate": 4.575757575757576e-05, "loss": 4.6948, "mean_token_accuracy": 0.7924428284168243, "step": 47},
    {"epoch": 0.384, "grad_norm": 6.909704685211182, "learning_rate": 4.565656565656566e-05, "loss": 5.0747, "mean_token_accuracy": 0.7648352980613708, "step": 48},
    {"epoch": 0.392, "grad_norm": 7.039934158325195, "learning_rate": 4.555555555555556e-05, "loss": 5.168, "mean_token_accuracy": 0.7628057301044464, "step": 49},
    {"epoch": 0.4, "grad_norm": 7.007484436035156, "learning_rate": 4.545454545454546e-05, "loss": 5.5139, "mean_token_accuracy": 0.7507320195436478, "step": 50},
    {"epoch": 0.408, "grad_norm": 7.163206100463867, "learning_rate": 4.535353535353535e-05, "loss": 4.6769, "mean_token_accuracy": 0.789847657084465, "step": 51},
    {"epoch": 0.416, "grad_norm": 6.482788562774658, "learning_rate": 4.525252525252526e-05, "loss": 4.0523, "mean_token_accuracy": 0.8091136366128922, "step": 52},
    {"epoch": 0.424, "grad_norm": 6.3435564041137695, "learning_rate": 4.515151515151516e-05, "loss": 5.0074, "mean_token_accuracy": 0.7676153928041458, "step": 53},
    {"epoch": 0.432, "grad_norm": 6.715310096740723, "learning_rate": 4.5050505050505056e-05, "loss": 4.7588, "mean_token_accuracy": 0.7704059928655624, "step": 54},
    {"epoch": 0.44, "grad_norm": 6.287456512451172, "learning_rate": 4.494949494949495e-05, "loss": 5.0595, "mean_token_accuracy": 0.7789322882890701, "step": 55},
    {"epoch": 0.448, "grad_norm": 6.842898845672607, "learning_rate": 4.484848484848485e-05, "loss": 4.3176, "mean_token_accuracy": 0.7993331402540207, "step": 56},
    {"epoch": 0.456, "grad_norm": 5.992519378662109, "learning_rate": 4.474747474747475e-05, "loss": 5.0297, "mean_token_accuracy": 0.779092013835907, "step": 57},
    {"epoch": 0.464, "grad_norm": 6.67594051361084, "learning_rate": 4.464646464646465e-05, "loss": 4.1185, "mean_token_accuracy": 0.7971874326467514, "step": 58},
    {"epoch": 0.472, "grad_norm": 6.712916374206543, "learning_rate": 4.454545454545455e-05, "loss": 4.6345, "mean_token_accuracy": 0.7811515778303146, "step": 59},
    {"epoch": 0.48, "grad_norm": 6.204624652862549, "learning_rate": 4.4444444444444447e-05, "loss": 4.5869, "mean_token_accuracy": 0.8028187602758408, "step": 60},
    {"epoch": 0.488, "grad_norm": 7.798375129699707, "learning_rate": 4.4343434343434346e-05, "loss": 5.2535, "mean_token_accuracy": 0.7713681906461716, "step": 61},
    {"epoch": 0.496, "grad_norm": 6.647665500640869, "learning_rate": 4.4242424242424246e-05, "loss": 4.1662, "mean_token_accuracy": 0.809452161192894, "step": 62},
    {"epoch": 0.504, "grad_norm": 7.158292770385742, "learning_rate": 4.4141414141414145e-05, "loss": 4.5907, "mean_token_accuracy": 0.7843180149793625, "step": 63},
    {"epoch": 0.512, "grad_norm": 6.763492107391357, "learning_rate": 4.4040404040404044e-05, "loss": 4.6913, "mean_token_accuracy": 0.7733302861452103, "step": 64},
    {"epoch": 0.52, "grad_norm": 6.745001792907715, "learning_rate": 4.3939393939393944e-05, "loss": 4.7438, "mean_token_accuracy": 0.7817106246948242, "step": 65},
    {"epoch": 0.528, "grad_norm": 5.92717981338501, "learning_rate": 4.383838383838384e-05, "loss": 4.3055, "mean_token_accuracy": 0.7931149005889893, "step": 66},
    {"epoch": 0.536, "grad_norm": 7.291048049926758, "learning_rate": 4.3737373737373736e-05, "loss": 4.7281, "mean_token_accuracy": 0.7924042791128159, "step": 67},
    {"epoch": 0.544, "grad_norm": 6.6944899559021, "learning_rate": 4.3636363636363636e-05, "loss": 5.4566, "mean_token_accuracy": 0.7465656846761703, "step": 68},
    {"epoch": 0.552, "grad_norm": 6.43382453918457, "learning_rate": 4.3535353535353535e-05, "loss": 5.1924, "mean_token_accuracy": 0.765620231628418, "step": 69},
    {"epoch": 0.56, "grad_norm": 7.896528244018555, "learning_rate": 4.343434343434344e-05, "loss": 4.535, "mean_token_accuracy": 0.780782625079155, "step": 70},
    {"epoch": 0.568, "grad_norm": 7.0929059982299805, "learning_rate": 4.3333333333333334e-05, "loss": 4.7798, "mean_token_accuracy": 0.7812370210886002, "step": 71},
    {"epoch": 0.576, "grad_norm": 6.4265031814575195, "learning_rate": 4.3232323232323234e-05, "loss": 4.0221, "mean_token_accuracy": 0.8055804222822189, "step": 72},
    {"epoch": 0.584, "grad_norm": 6.464663982391357, "learning_rate": 4.313131313131313e-05, "loss": 5.3328, "mean_token_accuracy": 0.7507916688919067, "step": 73},
    {"epoch": 0.592, "grad_norm": 6.163589000701904, "learning_rate": 4.303030303030303e-05, "loss": 3.8647, "mean_token_accuracy": 0.8181672990322113, "step": 74},
    {"epoch": 0.6, "grad_norm": 6.442709445953369, "learning_rate": 4.292929292929293e-05, "loss": 4.1589, "mean_token_accuracy": 0.7894054055213928, "step": 75},
    {"epoch": 0.608, "grad_norm": 6.7371673583984375, "learning_rate": 4.282828282828283e-05, "loss": 4.4978, "mean_token_accuracy": 0.7749454975128174, "step": 76},
    {"epoch": 0.616, "grad_norm": 6.681241035461426, "learning_rate": 4.2727272727272724e-05, "loss": 4.3404, "mean_token_accuracy": 0.8082751482725143, "step": 77},
    {"epoch": 0.624, "grad_norm": 6.194783687591553, "learning_rate": 4.262626262626263e-05, "loss": 4.1043, "mean_token_accuracy": 0.7980503439903259, "step": 78},
    {"epoch": 0.632, "grad_norm": 6.758218288421631, "learning_rate": 4.252525252525253e-05, "loss": 4.4278, "mean_token_accuracy": 0.7885774075984955, "step": 79},
    {"epoch": 0.64, "grad_norm": 7.5866546630859375, "learning_rate": 4.242424242424243e-05, "loss": 4.227, "mean_token_accuracy": 0.7932698577642441, "step": 80},
    {"epoch": 0.648, "grad_norm": 7.192435264587402, "learning_rate": 4.232323232323233e-05, "loss": 4.0345, "mean_token_accuracy": 0.8199726790189743, "step": 81},
    {"epoch": 0.656, "grad_norm": 5.501450061798096, "learning_rate": 4.222222222222222e-05, "loss": 4.2629, "mean_token_accuracy": 0.7981265485286713, "step": 82},
    {"epoch": 0.664, "grad_norm": 6.11761474609375, "learning_rate": 4.212121212121212e-05, "loss": 4.2, "mean_token_accuracy": 0.8049240857362747, "step": 83},
    {"epoch": 0.672, "grad_norm": 6.632277011871338, "learning_rate": 4.202020202020202e-05, "loss": 4.5717, "mean_token_accuracy": 0.784144937992096, "step": 84},
    {"epoch": 0.68, "grad_norm": 5.618972301483154, "learning_rate": 4.191919191919192e-05, "loss": 4.1424, "mean_token_accuracy": 0.8174345791339874, "step": 85},
    {"epoch": 0.688, "grad_norm": 6.348287105560303, "learning_rate": 4.181818181818182e-05, "loss": 3.9289, "mean_token_accuracy": 0.8103939890861511, "step": 86},
    {"epoch": 0.696, "grad_norm": 6.326577186584473, "learning_rate": 4.171717171717172e-05, "loss": 4.4158, "mean_token_accuracy": 0.7934867739677429, "step": 87},
    {"epoch": 0.704, "grad_norm": 5.938681602478027, "learning_rate": 4.161616161616162e-05, "loss": 4.3607, "mean_token_accuracy": 0.8023031949996948, "step": 88},
    {"epoch": 0.712, "grad_norm": 6.043774604797363, "learning_rate": 4.151515151515152e-05, "loss": 4.8988, "mean_token_accuracy": 0.7769377380609512, "step": 89},
    {"epoch": 0.72, "grad_norm": 6.513429641723633, "learning_rate": 4.141414141414142e-05, "loss": 4.469, "mean_token_accuracy": 0.7987037748098373, "step": 90},
    {"epoch": 0.728, "grad_norm": 6.306874752044678, "learning_rate": 4.131313131313132e-05, "loss": 3.8623, "mean_token_accuracy": 0.8089967370033264, "step": 91},
    {"epoch": 0.736, "grad_norm": 6.43092155456543, "learning_rate": 4.1212121212121216e-05, "loss": 4.1141, "mean_token_accuracy": 0.8077200204133987, "step": 92},
    {"epoch": 0.744, "grad_norm": 5.928807735443115, "learning_rate": 4.111111111111111e-05, "loss": 4.379, "mean_token_accuracy": 0.7994053959846497, "step": 93},
    {"epoch": 0.752, "grad_norm": 6.193979740142822, "learning_rate": 4.101010101010101e-05, "loss": 4.0719, "mean_token_accuracy": 0.7951067984104156, "step": 94},
    {"epoch": 0.76, "grad_norm": 5.613356113433838, "learning_rate": 4.0909090909090915e-05, "loss": 3.9982, "mean_token_accuracy": 0.8044578582048416, "step": 95},
    {"epoch": 0.768, "grad_norm": 5.828372478485107, "learning_rate": 4.0808080808080814e-05, "loss": 3.9207, "mean_token_accuracy": 0.804252415895462, "step": 96},
    {"epoch": 0.776, "grad_norm": 6.8737993240356445, "learning_rate": 4.070707070707071e-05, "loss": 4.018, "mean_token_accuracy": 0.7971819192171097, "step": 97},
    {"epoch": 0.784, "grad_norm": 6.014230251312256, "learning_rate": 4.0606060606060606e-05, "loss": 3.9449, "mean_token_accuracy": 0.8023800402879715, "step": 98},
    {"epoch": 0.792, "grad_norm": 6.424180030822754, "learning_rate": 4.0505050505050506e-05, "loss": 4.133, "mean_token_accuracy": 0.7898548692464828, "step": 99},
    {"epoch": 0.8, "grad_norm": 5.941647529602051, "learning_rate": 4.0404040404040405e-05, "loss": 4.4152, "mean_token_accuracy": 0.7944744974374771, "step": 100},
    {"epoch": 0.808, "grad_norm": 5.532174110412598, "learning_rate": 4.0303030303030305e-05, "loss": 3.9543, "mean_token_accuracy": 0.8119575679302216, "step": 101},
    {"epoch": 0.816, "grad_norm": 6.766019344329834, "learning_rate": 4.0202020202020204e-05, "loss": 3.8357, "mean_token_accuracy": 0.8240242004394531, "step": 102},
    {"epoch": 0.824, "grad_norm": 5.829742908477783, "learning_rate": 4.01010101010101e-05, "loss": 4.1291, "mean_token_accuracy": 0.798798069357872, "step": 103},
    {"epoch": 0.832, "grad_norm": 6.3326416015625, "learning_rate": 4e-05, "loss": 4.5605, "mean_token_accuracy": 0.7854688614606857, "step": 104},
    {"epoch": 0.84, "grad_norm": 5.9824042320251465, "learning_rate": 3.98989898989899e-05, "loss": 4.0588, "mean_token_accuracy": 0.8019276112318039, "step": 105},
    {"epoch": 0.848, "grad_norm": 6.245088577270508, "learning_rate": 3.97979797979798e-05, "loss": 3.9216, "mean_token_accuracy": 0.8176022469997406, "step": 106},
    {"epoch": 0.856, "grad_norm": 6.343690395355225, "learning_rate": 3.96969696969697e-05, "loss": 4.5661, "mean_token_accuracy": 0.7900134176015854, "step": 107},
    {"epoch": 0.864, "grad_norm": 6.176941394805908, "learning_rate": 3.9595959595959594e-05, "loss": 4.3509, "mean_token_accuracy": 0.7881277054548264, "step": 108},
    {"epoch": 0.872, "grad_norm": 5.415067672729492, "learning_rate": 3.9494949494949494e-05, "loss": 3.9209, "mean_token_accuracy": 0.8084693402051926, "step": 109},
    {"epoch": 0.88, "grad_norm": 6.951507568359375, "learning_rate": 3.939393939393939e-05, "loss": 3.8214, "mean_token_accuracy": 0.8184851258993149, "step": 110},
    {"epoch": 0.888, "grad_norm": 6.3500213623046875, "learning_rate": 3.929292929292929e-05, "loss": 4.445, "mean_token_accuracy": 0.7920354455709457, "step": 111},
    {"epoch": 0.896, "grad_norm": 6.108852386474609, "learning_rate": 3.91919191919192e-05, "loss": 3.9563, "mean_token_accuracy": 0.8090476542711258, "step": 112},
    {"epoch": 0.904, "grad_norm": 5.749330043792725, "learning_rate": 3.909090909090909e-05, "loss": 3.7184, "mean_token_accuracy": 0.8214946985244751, "step": 113},
    {"epoch": 0.912, "grad_norm": 6.137975692749023, "learning_rate": 3.898989898989899e-05, "loss": 3.8174, "mean_token_accuracy": 0.8351726979017258, "step": 114},
    {"epoch": 0.92, "grad_norm": 5.827854156494141, "learning_rate": 3.888888888888889e-05, "loss": 4.1516, "mean_token_accuracy": 0.8008674532175064, "step": 115},
    {"epoch": 0.928, "grad_norm": 6.231594562530518, "learning_rate": 3.878787878787879e-05, "loss": 3.9401, "mean_token_accuracy": 0.8067153543233871, "step": 116},
    {"epoch": 0.936, "grad_norm": 6.3449578285217285, "learning_rate": 3.868686868686869e-05, "loss": 4.5777, "mean_token_accuracy": 0.7864043414592743, "step": 117},
    {"epoch": 0.944, "grad_norm": 5.758909702301025, "learning_rate": 3.858585858585859e-05, "loss": 3.9313, "mean_token_accuracy": 0.8202812224626541, "step": 118},
    {"epoch": 0.952, "grad_norm": 5.801164627075195, "learning_rate": 3.848484848484848e-05, "loss": 3.4612, "mean_token_accuracy": 0.8463838249444962, "step": 119},
    {"epoch": 0.96, "grad_norm": 6.331445217132568, "learning_rate": 3.838383838383838e-05, "loss": 4.1784, "mean_token_accuracy": 0.8084011971950531, "step": 120},
    {"epoch": 0.968, "grad_norm": 5.821787357330322, "learning_rate": 3.828282828282829e-05, "loss": 4.0625, "mean_token_accuracy": 0.8006434291601181, "step": 121},
    {"epoch": 0.976, "grad_norm": 6.507475852966309, "learning_rate": 3.818181818181819e-05, "loss": 3.7078, "mean_token_accuracy": 0.8223373293876648, "step": 122},
    {"epoch": 0.984, "grad_norm": 5.8519511222839355, "learning_rate": 3.8080808080808087e-05, "loss": 3.8093, "mean_token_accuracy": 0.8161063343286514, "step": 123},
    {"epoch": 0.992, "grad_norm": 6.515493869781494, "learning_rate": 3.797979797979798e-05, "loss": 3.8881, "mean_token_accuracy": 0.8130738884210587, "step": 124},
    {"epoch": 1.0, "grad_norm": 5.974099159240723, "learning_rate": 3.787878787878788e-05, "loss": 3.8128, "mean_token_accuracy": 0.817224845290184, "step": 125},
    {"epoch": 1.008, "grad_norm": 5.158608436584473, "learning_rate": 3.777777777777778e-05, "loss": 3.5407, "mean_token_accuracy": 0.8298159688711166, "step": 126},
    {"epoch": 1.016, "grad_norm": 5.91457462310791, "learning_rate": 3.767676767676768e-05, "loss": 3.6346, "mean_token_accuracy": 0.8155378848314285, "step": 127},
    {"epoch": 1.024, "grad_norm": 5.550519943237305, "learning_rate": 3.757575757575758e-05, "loss": 3.8566, "mean_token_accuracy": 0.8141728788614273, "step": 128},
    {"epoch": 1.032, "grad_norm": 5.610849380493164, "learning_rate": 3.747474747474748e-05, "loss": 3.785, "mean_token_accuracy": 0.811878427863121, "step": 129},
    {"epoch": 1.04, "grad_norm": 6.2908854484558105, "learning_rate": 3.7373737373737376e-05, "loss": 3.0106, "mean_token_accuracy": 0.8458054810762405, "step": 130},
    {"epoch": 1.048, "grad_norm": 5.267597675323486, "learning_rate": 3.7272727272727276e-05, "loss": 3.4967, "mean_token_accuracy": 0.8339269608259201, "step": 131},
    {"epoch": 1.056, "grad_norm": 5.37116003036499, "learning_rate": 3.7171717171717175e-05, "loss": 3.4903, "mean_token_accuracy": 0.8220857381820679, "step": 132},
    {"epoch": 1.064, "grad_norm": 5.198507785797119, "learning_rate": 3.7070707070707075e-05, "loss": 2.9143, "mean_token_accuracy": 0.8564625233411789, "step": 133},
    {"epoch": 1.072, "grad_norm": 5.638608932495117, "learning_rate": 3.6969696969696974e-05, "loss": 3.6283, "mean_token_accuracy": 0.8230338096618652, "step": 134},
    {"epoch": 1.08, "grad_norm": 5.533100605010986, "learning_rate": 3.686868686868687e-05, "loss": 3.3163, "mean_token_accuracy": 0.8364241421222687, "step": 135},
    {"epoch": 1.088, "grad_norm": 5.794349670410156, "learning_rate": 3.6767676767676766e-05, "loss": 3.4776, "mean_token_accuracy": 0.822014644742012, "step": 136},
    {"epoch": 1.096, "grad_norm": 6.230714797973633, "learning_rate": 3.6666666666666666e-05, "loss": 3.9834, "mean_token_accuracy": 0.808947280049324, "step": 137},
    {"epoch": 1.104, "grad_norm": 5.761725902557373, "learning_rate": 3.656565656565657e-05, "loss": 3.5205, "mean_token_accuracy": 0.8236398249864578, "step": 138},
    {"epoch": 1.112, "grad_norm": 5.620357990264893, "learning_rate": 3.6464646464646465e-05, "loss": 3.7957, "mean_token_accuracy": 0.8184810876846313, "step": 139},
    {"epoch": 1.12, "grad_norm": 5.732111930847168, "learning_rate": 3.6363636363636364e-05, "loss": 3.3031, "mean_token_accuracy": 0.8268382251262665, "step": 140},
    {"epoch": 1.1280000000000001, "grad_norm": 5.512340545654297, "learning_rate": 3.6262626262626264e-05, "loss": 3.4623, "mean_token_accuracy": 0.8173917531967163, "step": 141},
    {"epoch": 1.1360000000000001, "grad_norm": 5.278458118438721, "learning_rate": 3.616161616161616e-05, "loss": 3.6187, "mean_token_accuracy": 0.8212647438049316, "step": 142},
    {"epoch": 1.144, "grad_norm": 6.000738620758057, "learning_rate": 3.606060606060606e-05, "loss": 3.4098, "mean_token_accuracy": 0.8118827193975449, "step": 143},
    {"epoch": 1.152, "grad_norm": 5.230849266052246, "learning_rate": 3.595959595959596e-05, "loss": 3.3312, "mean_token_accuracy": 0.8291827887296677, "step": 144},
    {"epoch": 1.16, "grad_norm": 6.520547389984131, "learning_rate": 3.5858585858585855e-05, "loss": 2.9547, "mean_token_accuracy": 0.8410928547382355, "step": 145},
    {"epoch": 1.168, "grad_norm": 6.905513286590576, "learning_rate": 3.575757575757576e-05, "loss": 3.3843, "mean_token_accuracy": 0.8292149305343628, "step": 146},
    {"epoch": 1.176, "grad_norm": 5.25789213180542, "learning_rate": 3.565656565656566e-05, "loss": 3.0306, "mean_token_accuracy": 0.8420234769582748, "step": 147},
    {"epoch": 1.184, "grad_norm": 6.301482677459717, "learning_rate": 3.555555555555556e-05, "loss": 3.563, "mean_token_accuracy": 0.816611647605896, "step": 148},
    {"epoch": 1.192, "grad_norm": 5.18954610824585, "learning_rate": 3.545454545454546e-05, "loss": 3.3712, "mean_token_accuracy": 0.8429348319768906, "step": 149},
    {"epoch": 1.2, "grad_norm": 5.924179553985596, "learning_rate": 3.535353535353535e-05, "loss": 3.0173, "mean_token_accuracy": 0.8484609872102737, "step": 150},
    {"epoch": 1.208, "grad_norm": 6.125277996063232, "learning_rate": 3.525252525252525e-05, "loss": 4.017, "mean_token_accuracy": 0.8024349808692932, "step": 151},
    {"epoch": 1.216, "grad_norm": 6.013985633850098, "learning_rate": 3.515151515151515e-05, "loss": 3.1488, "mean_token_accuracy": 0.8332614004611969, "step": 152},
    {"epoch": 1.224, "grad_norm": 5.863841533660889, "learning_rate": 3.505050505050505e-05, "loss": 3.4748, "mean_token_accuracy": 0.8309570550918579, "step": 153},
    {"epoch": 1.232, "grad_norm": 5.565553665161133, "learning_rate": 3.494949494949495e-05, "loss": 3.4478, "mean_token_accuracy": 0.8285814672708511, "step": 154},
    {"epoch": 1.24, "grad_norm": 5.7123565673828125, "learning_rate": 3.484848484848485e-05, "loss": 3.1585, "mean_token_accuracy": 0.8400322198867798, "step": 155},
    {"epoch": 1.248, "grad_norm": 5.499763488769531, "learning_rate": 3.474747474747475e-05, "loss": 2.9475, "mean_token_accuracy": 0.8378583043813705, "step": 156},
    {"epoch": 1.256, "grad_norm": 5.909451961517334, "learning_rate": 3.464646464646465e-05, "loss": 3.0639, "mean_token_accuracy": 0.8469367325305939, "step": 157},
    {"epoch": 1.264, "grad_norm": 6.213925838470459, "learning_rate": 3.454545454545455e-05, "loss": 3.7884, "mean_token_accuracy": 0.8231126517057419, "step": 158},
    {"epoch": 1.272, "grad_norm": 5.21215295791626, "learning_rate": 3.444444444444445e-05, "loss": 3.4636, "mean_token_accuracy": 0.8168640285730362, "step": 159},
    {"epoch": 1.28, "grad_norm": 5.759207725524902, "learning_rate": 3.434343434343435e-05, "loss": 3.2428, "mean_token_accuracy": 0.8419955372810364, "step": 160},
    {"epoch": 1.288, "grad_norm": 5.337326526641846, "learning_rate": 3.424242424242424e-05, "loss": 3.5007, "mean_token_accuracy": 0.826006218791008, "step": 161},
    {"epoch": 1.296, "grad_norm": 5.429465293884277, "learning_rate": 3.414141414141414e-05, "loss": 3.1902, "mean_token_accuracy": 0.8436289280653, "step": 162},
    {"epoch": 1.304, "grad_norm": 5.5671234130859375, "learning_rate": 3.4040404040404045e-05, "loss": 3.1885, "mean_token_accuracy": 0.837678074836731, "step": 163},
    {"epoch": 1.312, "grad_norm": 5.217177391052246, "learning_rate": 3.3939393939393945e-05, "loss": 3.0344, "mean_token_accuracy": 0.8463016897439957, "step": 164},
    {"epoch": 1.32, "grad_norm": 6.01118803024292, "learning_rate": 3.3838383838383844e-05, "loss": 3.6496, "mean_token_accuracy": 0.8142215311527252, "step": 165},
    {"epoch": 1.328, "grad_norm": 5.723564147949219, "learning_rate": 3.373737373737374e-05, "loss": 3.1859, "mean_token_accuracy": 0.8320183008909225, "step": 166},
    {"epoch": 1.336, "grad_norm": 5.838883399963379, "learning_rate": 3.3636363636363636e-05, "loss": 3.4536, "mean_token_accuracy": 0.8264990895986557, "step": 167},
    {"epoch": 1.3439999999999999, "grad_norm": 5.59492301940918, "learning_rate": 3.3535353535353536e-05, "loss": 3.1706, "mean_token_accuracy": 0.8360193520784378, "step": 168},
    {"epoch": 1.3519999999999999, "grad_norm": 5.386695861816406, "learning_rate": 3.3434343434343435e-05, "loss": 2.8613, "mean_token_accuracy": 0.8595142513513565, "step": 169},
    {"epoch": 1.3599999999999999, "grad_norm": 5.8718791007995605, "learning_rate": 3.3333333333333335e-05, "loss": 3.405, "mean_token_accuracy": 0.8177822679281235, "step": 170},
    {"epoch": 1.3679999999999999, "grad_norm": 5.533784866333008, "learning_rate": 3.3232323232323234e-05, "loss": 2.7535, "mean_token_accuracy": 0.847559779882431, "step": 171},
    {"epoch": 1.376, "grad_norm": 6.221394062042236, "learning_rate": 3.3131313131313134e-05, "loss": 3.2758, "mean_token_accuracy": 0.8370038121938705, "step": 172},
    {"epoch": 1.384, "grad_norm": 5.768967628479004, "learning_rate": 3.303030303030303e-05, "loss": 3.144, "mean_token_accuracy": 0.8520988523960114, "step": 173},
    {"epoch": 1.392, "grad_norm": 5.794501781463623, "learning_rate": 3.292929292929293e-05, "loss": 3.294, "mean_token_accuracy": 0.8296584039926529, "step": 174},
    {"epoch": 1.4, "grad_norm": 6.306114196777344, "learning_rate": 3.282828282828283e-05, "loss": 3.8122, "mean_token_accuracy": 0.8015681505203247, "step": 175},
    {"epoch": 1.408, "grad_norm": 5.309399127960205, "learning_rate": 3.272727272727273e-05, "loss": 3.5656, "mean_token_accuracy": 0.8148599863052368, "step": 176},
    {"epoch": 1.416, "grad_norm": 6.384012699127197, "learning_rate": 3.2626262626262624e-05, "loss": 3.3465, "mean_token_accuracy": 0.8153296113014221, "step": 177},
    {"epoch": 1.424, "grad_norm": 5.76847505569458, "learning_rate": 3.2525252525252524e-05, "loss": 3.5117, "mean_token_accuracy": 0.8243328183889389, "step": 178},
    {"epoch": 1.432, "grad_norm": 5.956698417663574, "learning_rate": 3.2424242424242423e-05, "loss": 3.3246, "mean_token_accuracy": 0.8309469819068909, "step": 179},
    {"epoch": 1.44, "grad_norm": 5.436334133148193, "learning_rate": 3.232323232323233e-05, "loss": 3.3948, "mean_token_accuracy": 0.8304746299982071, "step": 180},
    {"epoch": 1.448, "grad_norm": 6.713822364807129, "learning_rate": 3.222222222222223e-05, "loss": 3.7249, "mean_token_accuracy": 0.8197664022445679, "step": 181},
    {"epoch": 1.456, "grad_norm": 5.69707727432251, "learning_rate": 3.212121212121212e-05, "loss": 3.4366, "mean_token_accuracy": 0.8290866911411285, "step": 182},
    {"epoch": 1.464, "grad_norm": 5.674050807952881, "learning_rate": 3.202020202020202e-05, "loss": 3.4487, "mean_token_accuracy": 0.8316894471645355, "step": 183},
    {"epoch": 1.472, "grad_norm": 6.27432918548584, "learning_rate": 3.191919191919192e-05, "loss": 3.7579, "mean_token_accuracy": 0.8203246891498566, "step": 184},
    {"epoch": 1.48, "grad_norm": 6.963736534118652, "learning_rate": 3.181818181818182e-05, "loss": 3.3907, "mean_token_accuracy": 0.8244215101003647, "step": 185},
    {"epoch": 1.488, "grad_norm": 6.205438137054443, "learning_rate": 3.171717171717172e-05, "loss": 3.2918, "mean_token_accuracy": 0.8321795910596848, "step": 186},
    {"epoch": 1.496, "grad_norm": 6.026015281677246, "learning_rate": 3.161616161616161e-05, "loss": 3.7184, "mean_token_accuracy": 0.8272889703512192, "step": 187},
    {"epoch": 1.504, "grad_norm": 5.637466907501221, "learning_rate": 3.151515151515151e-05, "loss": 2.5835, "mean_token_accuracy": 0.8698715269565582, "step": 188},
    {"epoch": 1.512, "grad_norm": 6.474348545074463, "learning_rate": 3.141414141414142e-05, "loss": 3.2364, "mean_token_accuracy": 0.8249330222606659, "step": 189},
    {"epoch": 1.52, "grad_norm": 6.4876885414123535, "learning_rate": 3.131313131313132e-05, "loss": 3.3135, "mean_token_accuracy": 0.8382684588432312, "step": 190},
    {"epoch": 1.528, "grad_norm": 5.597368240356445, "learning_rate": 3.121212121212122e-05, "loss": 3.2368, "mean_token_accuracy": 0.8341715931892395, "step": 191},
    {"epoch": 1.536, "grad_norm": 6.087445259094238, "learning_rate": 3.111111111111111e-05, "loss": 3.4584, "mean_token_accuracy": 0.8352274149656296, "step": 192},
    {"epoch": 1.544, "grad_norm": 5.523479461669922, "learning_rate": 3.101010101010101e-05, "loss": 3.0476, "mean_token_accuracy": 0.8508585840463638, "step": 193},
    {"epoch": 1.552, "grad_norm": 5.6211442947387695, "learning_rate": 3.090909090909091e-05, "loss": 3.2642, "mean_token_accuracy": 0.8426928371191025, "step": 194},
    {"epoch": 1.56, "grad_norm": 5.739730358123779, "learning_rate": 3.080808080808081e-05, "loss": 3.6835, "mean_token_accuracy": 0.8226543515920639, "step": 195},
    {"epoch": 1.568, "grad_norm": 6.00140380859375, "learning_rate": 3.070707070707071e-05, "loss": 2.9485, "mean_token_accuracy": 0.8533317595720291, "step": 196},
    {"epoch": 1.576, "grad_norm": 5.541549205780029, "learning_rate": 3.060606060606061e-05, "loss": 3.0892, "mean_token_accuracy": 0.8474195152521133, "step": 197},
    {"epoch": 1.584, "grad_norm": 5.923431873321533, "learning_rate": 3.050505050505051e-05, "loss": 3.018, "mean_token_accuracy": 0.84869784116745, "step": 198},
    {"epoch": 1.592, "grad_norm": 6.204696178436279, "learning_rate": 3.0404040404040406e-05, "loss": 3.5126, "mean_token_accuracy": 0.8261184245347977, "step": 199},
    {"epoch": 1.6, "grad_norm": 6.3025712966918945, "learning_rate": 3.0303030303030306e-05, "loss": 3.4185, "mean_token_accuracy": 0.8210860043764114, "step": 200},
    {"epoch": 1.608, "grad_norm": 5.689709663391113, "learning_rate": 3.0202020202020205e-05, "loss": 3.5322, "mean_token_accuracy": 0.8191230297088623, "step": 201},
    {"epoch": 1.616, "grad_norm": 5.707929611206055, "learning_rate": 3.01010101010101e-05, "loss": 3.024, "mean_token_accuracy": 0.8488472253084183, "step": 202},
    {"epoch": 1.624, "grad_norm": 5.80771017074585, "learning_rate": 3e-05, "loss": 3.5556, "mean_token_accuracy": 0.8266987651586533, "step": 203},
    {"epoch": 1.6320000000000001, "grad_norm": 6.03700590133667, "learning_rate": 2.98989898989899e-05, "loss": 3.1062, "mean_token_accuracy": 0.8308413475751877, "step": 204},
    {"epoch": 1.6400000000000001, "grad_norm": 6.733646392822266, "learning_rate": 2.9797979797979796e-05, "loss": 3.1995, "mean_token_accuracy": 0.8294665813446045, "step": 205},
    {"epoch": 1.6480000000000001, "grad_norm": 5.883334636688232, "learning_rate": 2.96969696969697e-05, "loss": 2.8124, "mean_token_accuracy": 0.8439156115055084, "step": 206},
    {"epoch": 1.6560000000000001, "grad_norm": 5.941483497619629, "learning_rate": 2.95959595959596e-05, "loss": 3.0179, "mean_token_accuracy": 0.8416165858507156, "step": 207},
    {"epoch": 1.6640000000000001, "grad_norm": 6.391957759857178, "learning_rate": 2.9494949494949498e-05, "loss": 3.7068, "mean_token_accuracy": 0.8122821152210236, "step": 208},
    {"epoch": 1.6720000000000002, "grad_norm": 6.237335205078125, "learning_rate": 2.9393939393939394e-05, "loss": 3.32, "mean_token_accuracy": 0.8313788622617722, "step": 209},
    {"epoch": 1.6800000000000002, "grad_norm": 6.5731987953186035, "learning_rate": 2.9292929292929294e-05, "loss": 3.5619, "mean_token_accuracy": 0.816206768155098, "step": 210},
    {"epoch": 1.688, "grad_norm": 5.061030387878418, "learning_rate": 2.9191919191919193e-05, "loss": 2.8566, "mean_token_accuracy": 0.8475048393011093, "step": 211},
    {"epoch": 1.696, "grad_norm": 5.921192646026611, "learning_rate": 2.909090909090909e-05, "loss": 3.3349, "mean_token_accuracy": 0.827596977353096, "step": 212},
    {"epoch": 1.704, "grad_norm": 5.721929550170898, "learning_rate": 2.898989898989899e-05, "loss": 2.7363, "mean_token_accuracy": 0.8517429679632187, "step": 213},
    {"epoch": 1.712, "grad_norm": 6.763914585113525, "learning_rate": 2.8888888888888888e-05, "loss": 2.8292, "mean_token_accuracy": 0.8498516529798508, "step": 214},
    {"epoch": 1.72, "grad_norm": 6.817018032073975, "learning_rate": 2.878787878787879e-05, "loss": 3.2958, "mean_token_accuracy": 0.8348537087440491, "step": 215},
    {"epoch": 1.728, "grad_norm": 5.659510612487793, "learning_rate": 2.868686868686869e-05, "loss": 2.8683, "mean_token_accuracy": 0.8435689210891724, "step": 216},
    {"epoch": 1.736, "grad_norm": 6.099102020263672, "learning_rate": 2.8585858585858587e-05, "loss": 2.9825, "mean_token_accuracy": 0.8394478559494019, "step": 217},
    {"epoch": 1.744, "grad_norm": 6.094569206237793, "learning_rate": 2.8484848484848486e-05, "loss": 3.2325, "mean_token_accuracy": 0.8246908932924271, "step": 218},
    {"epoch": 1.752, "grad_norm": 6.866254806518555, "learning_rate": 2.8383838383838386e-05, "loss": 3.1284, "mean_token_accuracy": 0.8342044651508331, "step": 219},
    {"epoch": 1.76, "grad_norm": 6.149699687957764, "learning_rate": 2.8282828282828282e-05, "loss": 3.6781, "mean_token_accuracy": 0.8211702555418015, "step": 220},
    {"epoch": 1.768, "grad_norm": 5.862130641937256, "learning_rate": 2.818181818181818e-05, "loss": 3.3624, "mean_token_accuracy": 0.8324427157640457, "step": 221},
    {"epoch": 1.776, "grad_norm": 5.395320415496826, "learning_rate": 2.808080808080808e-05, "loss": 3.3338, "mean_token_accuracy": 0.8399553894996643, "step": 222},
    {"epoch": 1.784, "grad_norm": 5.550381183624268, "learning_rate": 2.7979797979797984e-05, "loss": 3.4036, "mean_token_accuracy": 0.8223441988229752, "step": 223},
    {"epoch": 1.792, "grad_norm": 5.406087875366211, "learning_rate": 2.7878787878787883e-05, "loss": 2.6777, "mean_token_accuracy": 0.8600380420684814, "step": 224},
    {"epoch": 1.8, "grad_norm": 5.7635698318481445, "learning_rate": 2.777777777777778e-05, "loss": 3.5319, "mean_token_accuracy": 0.8255642205476761, "step": 225},
    {"epoch": 1.808, "grad_norm": 5.934004783630371, "learning_rate": 2.767676767676768e-05, "loss": 3.2659, "mean_token_accuracy": 0.8305630534887314, "step": 226},
    {"epoch": 1.8159999999999998, "grad_norm": 6.8721184730529785, "learning_rate": 2.7575757575757578e-05, "loss": 3.1587, "mean_token_accuracy": 0.8299548774957657, "step": 227},
    {"epoch": 1.8239999999999998, "grad_norm": 5.556332588195801, "learning_rate": 2.7474747474747474e-05, "loss": 2.8912, "mean_token_accuracy": 0.8541744500398636, "step": 228},
    {"epoch": 1.8319999999999999, "grad_norm": 5.591052532196045, "learning_rate": 2.7373737373737374e-05, "loss": 3.1768, "mean_token_accuracy": 0.840098574757576, "step": 229},
    {"epoch": 1.8399999999999999, "grad_norm": 7.923620700836182, "learning_rate": 2.7272727272727273e-05, "loss": 3.1165, "mean_token_accuracy": 0.8416318744421005, "step": 230},
    {"epoch": 1.8479999999999999, "grad_norm": 6.180293083190918, "learning_rate": 2.717171717171717e-05, "loss": 3.1878, "mean_token_accuracy": 0.8414299786090851, "step": 231},
    {"epoch": 1.8559999999999999, "grad_norm": 6.479142189025879, "learning_rate": 2.7070707070707075e-05, "loss": 3.519, "mean_token_accuracy": 0.8237949758768082, "step": 232},
    {"epoch": 1.8639999999999999, "grad_norm": 6.211218357086182, "learning_rate": 2.696969696969697e-05, "loss": 3.6602, "mean_token_accuracy": 0.8112812489271164, "step": 233},
    {"epoch": 1.8719999999999999, "grad_norm": 6.262876033782959, "learning_rate": 2.686868686868687e-05, "loss": 3.8658, "mean_token_accuracy": 0.8071164190769196, "step": 234},
    {"epoch": 1.88, "grad_norm": 5.474344730377197, "learning_rate": 2.676767676767677e-05, "loss": 2.971, "mean_token_accuracy": 0.8377246409654617, "step": 235},
    {"epoch": 1.888, "grad_norm": 6.193055629730225, "learning_rate": 2.6666666666666667e-05, "loss": 3.478, "mean_token_accuracy": 0.823166161775589, "step": 236},
    {"epoch": 1.896, "grad_norm": 5.627188682556152, "learning_rate": 2.6565656565656566e-05, "loss": 3.1995, "mean_token_accuracy": 0.8456520289182663, "step": 237},
    {"epoch": 1.904, "grad_norm": 6.110049247741699, "learning_rate": 2.6464646464646466e-05, "loss": 3.0544, "mean_token_accuracy": 0.8367758989334106, "step": 238},
    {"epoch": 1.912, "grad_norm": 5.3701982498168945, "learning_rate": 2.636363636363636e-05, "loss": 2.6659, "mean_token_accuracy": 0.8609540313482285, "step": 239},
    {"epoch": 1.92, "grad_norm": 5.352710247039795, "learning_rate": 2.6262626262626268e-05, "loss": 3.082, "mean_token_accuracy": 0.842663049697876, "step": 240},
    {"epoch": 1.928, "grad_norm": 6.08484411239624, "learning_rate": 2.6161616161616164e-05, "loss": 3.3737, "mean_token_accuracy": 0.8245035409927368, "step": 241},
    {"epoch": 1.936, "grad_norm": 5.956201076507568, "learning_rate": 2.6060606060606063e-05, "loss": 2.9072, "mean_token_accuracy": 0.8548275828361511, "step": 242},
    {"epoch": 1.944, "grad_norm": 5.499819278717041, "learning_rate": 2.5959595959595963e-05, "loss": 3.355, "mean_token_accuracy": 0.820908397436142, "step": 243},
    {"epoch": 1.952, "grad_norm": 6.338292598724365, "learning_rate": 2.585858585858586e-05, "loss": 3.7158, "mean_token_accuracy": 0.8252478241920471, "step": 244},
    {"epoch": 1.96, "grad_norm": 6.105760097503662, "learning_rate": 2.575757575757576e-05, "loss": 3.1973, "mean_token_accuracy": 0.8429799377918243, "step": 245},
    {"epoch": 1.968, "grad_norm": 5.636971950531006, "learning_rate": 2.5656565656565658e-05, "loss": 2.6206, "mean_token_accuracy": 0.8573070019483566, "step": 246},
    {"epoch": 1.976, "grad_norm": 6.419624328613281, "learning_rate": 2.5555555555555554e-05, "loss": 3.5778, "mean_token_accuracy": 0.8265507072210312, "step": 247},
    {"epoch": 1.984, "grad_norm": 5.554307460784912, "learning_rate": 2.5454545454545454e-05, "loss": 2.9344, "mean_token_accuracy": 0.8515568971633911, "step": 248},
    {"epoch": 1.992, "grad_norm": 6.059617519378662, "learning_rate": 2.5353535353535356e-05, "loss": 3.4458, "mean_token_accuracy": 0.8348643332719803, "step": 249},
    {"epoch": 2.0, "grad_norm": 5.995134353637695, "learning_rate": 2.5252525252525256e-05, "loss": 2.9078, "mean_token_accuracy": 0.8390886038541794, "step": 250},
    {"epoch": 2.008, "grad_norm": 5.29079532623291, "learning_rate": 2.5151515151515155e-05, "loss": 2.5449, "mean_token_accuracy": 0.8623387068510056, "step": 251},
    {"epoch": 2.016, "grad_norm": 6.307847499847412, "learning_rate": 2.505050505050505e-05, "loss": 2.6746, "mean_token_accuracy": 0.8547452688217163, "step": 252},
    {"epoch": 2.024, "grad_norm": 5.258662700653076, "learning_rate": 2.494949494949495e-05, "loss": 2.8106, "mean_token_accuracy": 0.8465935438871384, "step": 253},
    {"epoch": 2.032, "grad_norm": 5.160598278045654, "learning_rate": 2.4848484848484847e-05, "loss": 2.5327, "mean_token_accuracy": 0.8596736937761307, "step": 254},
    {"epoch": 2.04, "grad_norm": 5.5139055252075195, "learning_rate": 2.474747474747475e-05, "loss": 2.5406, "mean_token_accuracy": 0.8599353432655334, "step": 255},
    {"epoch": 2.048, "grad_norm": 5.596591949462891, "learning_rate": 2.464646464646465e-05, "loss": 2.5955, "mean_token_accuracy": 0.8636805862188339, "step": 256},
    {"epoch": 2.056, "grad_norm": 5.260463237762451, "learning_rate": 2.4545454545454545e-05, "loss": 2.8003, "mean_token_accuracy": 0.8472322225570679, "step": 257},
    {"epoch": 2.064, "grad_norm": 5.485147476196289, "learning_rate": 2.4444444444444445e-05, "loss": 2.4907, "mean_token_accuracy": 0.8619499355554581, "step": 258},
    {"epoch": 2.072, "grad_norm": 5.589687824249268, "learning_rate": 2.4343434343434344e-05, "loss": 2.563, "mean_token_accuracy": 0.8651385009288788, "step": 259},
    {"epoch": 2.08, "grad_norm": 4.876934051513672, "learning_rate": 2.4242424242424244e-05, "loss": 2.0974, "mean_token_accuracy": 0.8819858431816101, "step": 260},
    {"epoch": 2.088, "grad_norm": 6.266694068908691, "learning_rate": 2.4141414141414143e-05, "loss": 2.6164, "mean_token_accuracy": 0.8617752641439438, "step": 261},
    {"epoch": 2.096, "grad_norm": 5.54473352432251, "learning_rate": 2.404040404040404e-05, "loss": 2.9005, "mean_token_accuracy": 0.8471231460571289, "step": 262},
    {"epoch": 2.104, "grad_norm": 5.299022197723389, "learning_rate": 2.393939393939394e-05, "loss": 2.3437, "mean_token_accuracy": 0.872315376996994, "step": 263},
    {"epoch": 2.112, "grad_norm": 5.8907856941223145, "learning_rate": 2.3838383838383842e-05, "loss": 2.5687, "mean_token_accuracy": 0.8683241903781891, "step": 264},
    {"epoch": 2.12, "grad_norm": 5.84760046005249, "learning_rate": 2.3737373737373738e-05, "loss": 2.6167, "mean_token_accuracy": 0.8569298684597015, "step": 265},
    {"epoch": 2.128, "grad_norm": 5.831202507019043, "learning_rate": 2.3636363636363637e-05, "loss": 3.0151, "mean_token_accuracy": 0.8395980596542358, "step": 266},
    {"epoch": 2.136, "grad_norm": 5.698826313018799, "learning_rate": 2.3535353535353537e-05, "loss": 3.1071, "mean_token_accuracy": 0.850052535533905, "step": 267},
    {"epoch": 2.144, "grad_norm": 6.112237930297852, "learning_rate": 2.3434343434343436e-05, "loss": 2.624, "mean_token_accuracy": 0.8661712259054184, "step": 268},
    {"epoch": 2.152, "grad_norm": 6.229471683502197, "learning_rate": 2.3333333333333336e-05, "loss": 2.5656, "mean_token_accuracy": 0.8691338896751404, "step": 269},
    {"epoch": 2.16, "grad_norm": 6.126317501068115, "learning_rate": 2.3232323232323232e-05, "loss": 2.6405, "mean_token_accuracy": 0.8534891903400421, "step": 270},
    {"epoch": 2.168, "grad_norm": 6.28452205657959, "learning_rate": 2.313131313131313e-05, "loss": 2.6796, "mean_token_accuracy": 0.8531413078308105, "step": 271},
    {"epoch": 2.176, "grad_norm": 5.863641262054443, "learning_rate": 2.3030303030303034e-05, "loss": 2.7015, "mean_token_accuracy": 0.8519039303064346, "step": 272},
    {"epoch": 2.184, "grad_norm": 6.535420894622803, "learning_rate": 2.292929292929293e-05, "loss": 3.0103, "mean_token_accuracy": 0.8415365815162659, "step": 273},
    {"epoch": 2.192, "grad_norm": 5.875096321105957, "learning_rate": 2.282828282828283e-05, "loss": 2.8179, "mean_token_accuracy": 0.8494950979948044, "step": 274},
    {"epoch": 2.2, "grad_norm": 5.35692834854126, "learning_rate": 2.272727272727273e-05, "loss": 2.1563, "mean_token_accuracy": 0.8810548335313797, "step": 275},
    {"epoch": 2.208, "grad_norm": 6.05448579788208, "learning_rate": 2.262626262626263e-05, "loss": 2.4902, "mean_token_accuracy": 0.8656753301620483, "step": 276},
    {"epoch": 2.216, "grad_norm": 5.888792037963867, "learning_rate": 2.2525252525252528e-05, "loss": 2.5769, "mean_token_accuracy": 0.8682566732168198, "step": 277},
    {"epoch": 2.224, "grad_norm": 5.912112712860107, "learning_rate": 2.2424242424242424e-05, "loss": 2.5715, "mean_token_accuracy": 0.8543179333209991, "step": 278},
    {"epoch": 2.232, "grad_norm": 6.485673427581787, "learning_rate": 2.2323232323232324e-05, "loss": 2.6801, "mean_token_accuracy": 0.8597021549940109, "step": 279},
    {"epoch": 2.24, "grad_norm": 5.757596969604492, "learning_rate": 2.2222222222222223e-05, "loss": 2.5495, "mean_token_accuracy": 0.8788398951292038, "step": 280},
    {"epoch": 2.248, "grad_norm": 5.577345848083496, "learning_rate": 2.2121212121212123e-05, "loss": 2.4548, "mean_token_accuracy": 0.8728667497634888, "step": 281},
    {"epoch": 2.2560000000000002, "grad_norm": 8.356124877929688, "learning_rate": 2.2020202020202022e-05, "loss": 2.9846, "mean_token_accuracy": 0.8483117371797562, "step": 282},
    {"epoch": 2.2640000000000002, "grad_norm": 5.887564659118652, "learning_rate": 2.191919191919192e-05, "loss": 2.7833, "mean_token_accuracy": 0.8488295823335648, "step": 283},
    {"epoch": 2.2720000000000002, "grad_norm": 6.558447360992432, "learning_rate": 2.1818181818181818e-05, "loss": 2.3944, "mean_token_accuracy": 0.8682046681642532, "step": 284},
    {"epoch": 2.2800000000000002, "grad_norm": 6.828135013580322, "learning_rate": 2.171717171717172e-05, "loss": 2.8403, "mean_token_accuracy": 0.8527731895446777, "step": 285},
    {"epoch": 2.288, "grad_norm": 5.772212505340576, "learning_rate": 2.1616161616161617e-05, "loss": 2.8966, "mean_token_accuracy": 0.8490441888570786, "step": 286},
    {"epoch": 2.296, "grad_norm": 5.538363933563232, "learning_rate": 2.1515151515151516e-05, "loss": 2.4595, "mean_token_accuracy": 0.8743099421262741, "step": 287},
    {"epoch": 2.304, "grad_norm": 6.172807216644287, "learning_rate": 2.1414141414141416e-05, "loss": 2.3328, "mean_token_accuracy": 0.8704792261123657, "step": 288},
    {"epoch": 2.312, "grad_norm": 6.232668399810791, "learning_rate": 2.1313131313131315e-05, "loss": 2.9026, "mean_token_accuracy": 0.8546594232320786, "step": 289},
    {"epoch": 2.32, "grad_norm": 6.070257663726807, "learning_rate": 2.1212121212121215e-05, "loss": 2.4686, "mean_token_accuracy": 0.8643316328525543, "step": 290},
    {"epoch": 2.328, "grad_norm": 5.737977504730225, "learning_rate": 2.111111111111111e-05, "loss": 2.9917, "mean_token_accuracy": 0.8514521420001984, "step": 291},
    {"epoch": 2.336, "grad_norm": 6.409512519836426, "learning_rate": 2.101010101010101e-05, "loss": 2.6572, "mean_token_accuracy": 0.8566225320100784, "step": 292},
    {"epoch": 2.344, "grad_norm": 6.57427978515625, "learning_rate": 2.090909090909091e-05, "loss": 2.466, "mean_token_accuracy": 0.853950634598732, "step": 293},
    {"epoch": 2.352, "grad_norm": 6.017867088317871, "learning_rate": 2.080808080808081e-05, "loss": 2.8763, "mean_token_accuracy": 0.8532497733831406, "step": 294},
    {"epoch": 2.36, "grad_norm": 5.640237808227539, "learning_rate": 2.070707070707071e-05, "loss": 2.6013, "mean_token_accuracy": 0.8739284723997116, "step": 295},
    {"epoch": 2.368, "grad_norm": 6.341038703918457, "learning_rate": 2.0606060606060608e-05, "loss": 2.6485, "mean_token_accuracy": 0.8521928191184998, "step": 296},
    {"epoch": 2.376, "grad_norm": 6.142149925231934, "learning_rate": 2.0505050505050504e-05, "loss": 2.3397, "mean_token_accuracy": 0.8850863575935364, "step": 297},
    {"epoch": 2.384, "grad_norm": 6.308354377746582, "learning_rate": 2.0404040404040407e-05, "loss": 2.5977, "mean_token_accuracy": 0.8648134768009186, "step": 298},
    {"epoch": 2.392, "grad_norm": 5.566152572631836, "learning_rate": 2.0303030303030303e-05, "loss": 2.7123, "mean_token_accuracy": 0.8633100241422653, "step": 299},
    {"epoch": 2.4, "grad_norm": 5.725775241851807, "learning_rate": 2.0202020202020203e-05, "loss": 2.6108, "mean_token_accuracy": 0.8738928139209747, "step": 300},
    {"epoch": 2.408, "grad_norm": 6.435664176940918, "learning_rate": 2.0101010101010102e-05, "loss": 2.7976, "mean_token_accuracy": 0.8510608673095703, "step": 301},
    {"epoch": 2.416, "grad_norm": 7.4542622566223145, "learning_rate": 2e-05, "loss": 2.8097, "mean_token_accuracy": 0.8516113609075546, "step": 302},
    {"epoch": 2.424, "grad_norm": 5.655482769012451, "learning_rate": 1.98989898989899e-05, "loss": 2.3856, "mean_token_accuracy": 0.8649388700723648, "step": 303},
    {"epoch": 2.432, "grad_norm": 6.412247180938721, "learning_rate": 1.9797979797979797e-05, "loss": 2.8675, "mean_token_accuracy": 0.8470883667469025, "step": 304},
    {"epoch": 2.44, "grad_norm": 7.186456680297852, "learning_rate": 1.9696969696969697e-05, "loss": 3.1156, "mean_token_accuracy": 0.8328486531972885, "step": 305},
    {"epoch": 2.448, "grad_norm": 5.615805625915527, "learning_rate": 1.95959595959596e-05, "loss": 2.6565, "mean_token_accuracy": 0.8666907250881195, "step": 306},
    {"epoch": 2.456, "grad_norm": 6.449917793273926, "learning_rate": 1.9494949494949496e-05, "loss": 2.9544, "mean_token_accuracy": 0.8483339697122574,
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 5.747985363006592, |
|
"learning_rate": 1.9393939393939395e-05, |
|
"loss": 2.1422, |
|
"mean_token_accuracy": 0.8723134845495224, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.472, |
|
"grad_norm": 6.027318000793457, |
|
"learning_rate": 1.9292929292929295e-05, |
|
"loss": 2.5305, |
|
"mean_token_accuracy": 0.8660032451152802, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 7.454213619232178, |
|
"learning_rate": 1.919191919191919e-05, |
|
"loss": 2.5679, |
|
"mean_token_accuracy": 0.8533085733652115, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.488, |
|
"grad_norm": 5.4801716804504395, |
|
"learning_rate": 1.9090909090909094e-05, |
|
"loss": 2.6006, |
|
"mean_token_accuracy": 0.8550811111927032, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 6.020359039306641, |
|
"learning_rate": 1.898989898989899e-05, |
|
"loss": 2.2521, |
|
"mean_token_accuracy": 0.8731967061758041, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.504, |
|
"grad_norm": 5.78825569152832, |
|
"learning_rate": 1.888888888888889e-05, |
|
"loss": 2.4461, |
|
"mean_token_accuracy": 0.8596260696649551, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 5.771088123321533, |
|
"learning_rate": 1.878787878787879e-05, |
|
"loss": 2.3915, |
|
"mean_token_accuracy": 0.8663473874330521, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 5.939653396606445, |
|
"learning_rate": 1.8686868686868688e-05, |
|
"loss": 2.7159, |
|
"mean_token_accuracy": 0.8722144514322281, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 6.489267349243164, |
|
"learning_rate": 1.8585858585858588e-05, |
|
"loss": 2.6137, |
|
"mean_token_accuracy": 0.8580323159694672, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"grad_norm": 5.97398042678833, |
|
"learning_rate": 1.8484848484848487e-05, |
|
"loss": 2.7352, |
|
"mean_token_accuracy": 0.8655538409948349, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 5.858009338378906, |
|
"learning_rate": 1.8383838383838383e-05, |
|
"loss": 2.2991, |
|
"mean_token_accuracy": 0.8808925747871399, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.552, |
|
"grad_norm": 5.720334053039551, |
|
"learning_rate": 1.8282828282828286e-05, |
|
"loss": 2.6282, |
|
"mean_token_accuracy": 0.8669092357158661, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 6.569767475128174, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 2.5663, |
|
"mean_token_accuracy": 0.8692184388637543, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.568, |
|
"grad_norm": 6.509884357452393, |
|
"learning_rate": 1.808080808080808e-05, |
|
"loss": 2.8698, |
|
"mean_token_accuracy": 0.8457562029361725, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 5.976521968841553, |
|
"learning_rate": 1.797979797979798e-05, |
|
"loss": 2.7501, |
|
"mean_token_accuracy": 0.846638560295105, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.584, |
|
"grad_norm": 5.878550052642822, |
|
"learning_rate": 1.787878787878788e-05, |
|
"loss": 2.2021, |
|
"mean_token_accuracy": 0.8834634870290756, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 5.905157566070557, |
|
"learning_rate": 1.777777777777778e-05, |
|
"loss": 2.5808, |
|
"mean_token_accuracy": 0.8616163432598114, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 5.510688304901123, |
|
"learning_rate": 1.7676767676767676e-05, |
|
"loss": 2.265, |
|
"mean_token_accuracy": 0.8738918304443359, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 5.952402591705322, |
|
"learning_rate": 1.7575757575757576e-05, |
|
"loss": 2.7743, |
|
"mean_token_accuracy": 0.85427226126194, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.616, |
|
"grad_norm": 6.647154808044434, |
|
"learning_rate": 1.7474747474747475e-05, |
|
"loss": 2.9181, |
|
"mean_token_accuracy": 0.8349238932132721, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 7.24924898147583, |
|
"learning_rate": 1.7373737373737375e-05, |
|
"loss": 3.1174, |
|
"mean_token_accuracy": 0.8411069959402084, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.632, |
|
"grad_norm": 5.238758563995361, |
|
"learning_rate": 1.7272727272727274e-05, |
|
"loss": 2.4068, |
|
"mean_token_accuracy": 0.8669510632753372, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 5.997193336486816, |
|
"learning_rate": 1.7171717171717173e-05, |
|
"loss": 3.0963, |
|
"mean_token_accuracy": 0.8448814451694489, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.648, |
|
"grad_norm": 6.348751544952393, |
|
"learning_rate": 1.707070707070707e-05, |
|
"loss": 2.8593, |
|
"mean_token_accuracy": 0.8491098284721375, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 5.824342250823975, |
|
"learning_rate": 1.6969696969696972e-05, |
|
"loss": 2.5448, |
|
"mean_token_accuracy": 0.8616123348474503, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.664, |
|
"grad_norm": 7.513331890106201, |
|
"learning_rate": 1.686868686868687e-05, |
|
"loss": 2.5108, |
|
"mean_token_accuracy": 0.8665321916341782, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 5.547114372253418, |
|
"learning_rate": 1.6767676767676768e-05, |
|
"loss": 2.674, |
|
"mean_token_accuracy": 0.8699924349784851, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 6.60705041885376, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 2.9442, |
|
"mean_token_accuracy": 0.8435302972793579, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 6.116333961486816, |
|
"learning_rate": 1.6565656565656567e-05, |
|
"loss": 2.3428, |
|
"mean_token_accuracy": 0.8697264492511749, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.6959999999999997, |
|
"grad_norm": 6.642714500427246, |
|
"learning_rate": 1.6464646464646466e-05, |
|
"loss": 2.7629, |
|
"mean_token_accuracy": 0.863296702504158, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 6.764316558837891, |
|
"learning_rate": 1.6363636363636366e-05, |
|
"loss": 3.0041, |
|
"mean_token_accuracy": 0.8556036204099655, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.7119999999999997, |
|
"grad_norm": 5.682474613189697, |
|
"learning_rate": 1.6262626262626262e-05, |
|
"loss": 2.4677, |
|
"mean_token_accuracy": 0.8652613461017609, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 6.347187519073486, |
|
"learning_rate": 1.6161616161616165e-05, |
|
"loss": 2.4674, |
|
"mean_token_accuracy": 0.8690385818481445, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.7279999999999998, |
|
"grad_norm": 5.507145404815674, |
|
"learning_rate": 1.606060606060606e-05, |
|
"loss": 2.692, |
|
"mean_token_accuracy": 0.8610135316848755, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 6.108325481414795, |
|
"learning_rate": 1.595959595959596e-05, |
|
"loss": 2.4909, |
|
"mean_token_accuracy": 0.8648791164159775, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.7439999999999998, |
|
"grad_norm": 6.017334938049316, |
|
"learning_rate": 1.585858585858586e-05, |
|
"loss": 2.2442, |
|
"mean_token_accuracy": 0.8753398060798645, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 6.193384170532227, |
|
"learning_rate": 1.5757575757575756e-05, |
|
"loss": 2.3729, |
|
"mean_token_accuracy": 0.8614612221717834, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 6.495743751525879, |
|
"learning_rate": 1.565656565656566e-05, |
|
"loss": 2.8584, |
|
"mean_token_accuracy": 0.8486870527267456, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 5.537295818328857, |
|
"learning_rate": 1.5555555555555555e-05, |
|
"loss": 2.3311, |
|
"mean_token_accuracy": 0.8707073032855988, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.776, |
|
"grad_norm": 5.175790309906006, |
|
"learning_rate": 1.5454545454545454e-05, |
|
"loss": 2.4548, |
|
"mean_token_accuracy": 0.8665929436683655, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 5.646080017089844, |
|
"learning_rate": 1.5353535353535354e-05, |
|
"loss": 2.4042, |
|
"mean_token_accuracy": 0.8688449859619141, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.792, |
|
"grad_norm": 5.514770984649658, |
|
"learning_rate": 1.5252525252525255e-05, |
|
"loss": 2.3592, |
|
"mean_token_accuracy": 0.868006706237793, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 5.443539619445801, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 2.7749, |
|
"mean_token_accuracy": 0.8517529368400574, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.808, |
|
"grad_norm": 6.177525043487549, |
|
"learning_rate": 1.505050505050505e-05, |
|
"loss": 2.7422, |
|
"mean_token_accuracy": 0.853731244802475, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 5.842599868774414, |
|
"learning_rate": 1.494949494949495e-05, |
|
"loss": 2.7828, |
|
"mean_token_accuracy": 0.8474886268377304, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.824, |
|
"grad_norm": 6.956360816955566, |
|
"learning_rate": 1.484848484848485e-05, |
|
"loss": 2.6584, |
|
"mean_token_accuracy": 0.8608374446630478, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 6.075902938842773, |
|
"learning_rate": 1.4747474747474749e-05, |
|
"loss": 1.7885, |
|
"mean_token_accuracy": 0.8908968865871429, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 5.676952838897705, |
|
"learning_rate": 1.4646464646464647e-05, |
|
"loss": 2.2233, |
|
"mean_token_accuracy": 0.8666234165430069, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 6.072289943695068, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 2.9788, |
|
"mean_token_accuracy": 0.8446744084358215, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.856, |
|
"grad_norm": 6.316305637359619, |
|
"learning_rate": 1.4444444444444444e-05, |
|
"loss": 2.6826, |
|
"mean_token_accuracy": 0.8572750687599182, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 6.896461486816406, |
|
"learning_rate": 1.4343434343434345e-05, |
|
"loss": 2.3163, |
|
"mean_token_accuracy": 0.874391183257103, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.872, |
|
"grad_norm": 6.410128116607666, |
|
"learning_rate": 1.4242424242424243e-05, |
|
"loss": 2.66, |
|
"mean_token_accuracy": 0.8778804391622543, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 5.7101664543151855, |
|
"learning_rate": 1.4141414141414141e-05, |
|
"loss": 2.5463, |
|
"mean_token_accuracy": 0.8713642507791519, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.888, |
|
"grad_norm": 5.941382884979248, |
|
"learning_rate": 1.404040404040404e-05, |
|
"loss": 2.4611, |
|
"mean_token_accuracy": 0.8689820915460587, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 5.766918182373047, |
|
"learning_rate": 1.3939393939393942e-05, |
|
"loss": 2.3588, |
|
"mean_token_accuracy": 0.8715604543685913, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.904, |
|
"grad_norm": 6.412060260772705, |
|
"learning_rate": 1.383838383838384e-05, |
|
"loss": 2.9418, |
|
"mean_token_accuracy": 0.8422341346740723, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 6.339536666870117, |
|
"learning_rate": 1.3737373737373737e-05, |
|
"loss": 2.8499, |
|
"mean_token_accuracy": 0.8519201278686523, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 5.80408239364624, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 2.6526, |
|
"mean_token_accuracy": 0.8615650832653046, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 5.643406391143799, |
|
"learning_rate": 1.3535353535353538e-05, |
|
"loss": 2.4131, |
|
"mean_token_accuracy": 0.8788148909807205, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.936, |
|
"grad_norm": 6.2641167640686035, |
|
"learning_rate": 1.3434343434343436e-05, |
|
"loss": 2.7667, |
|
"mean_token_accuracy": 0.8569764792919159, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 6.191479682922363, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 2.3207, |
|
"mean_token_accuracy": 0.8694847077131271, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.952, |
|
"grad_norm": 5.668253421783447, |
|
"learning_rate": 1.3232323232323233e-05, |
|
"loss": 2.4538, |
|
"mean_token_accuracy": 0.8613507598638535, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 5.68297004699707, |
|
"learning_rate": 1.3131313131313134e-05, |
|
"loss": 2.4336, |
|
"mean_token_accuracy": 0.8691912293434143, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.968, |
|
"grad_norm": 6.308213710784912, |
|
"learning_rate": 1.3030303030303032e-05, |
|
"loss": 2.9029, |
|
"mean_token_accuracy": 0.8455093801021576, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 5.907562255859375, |
|
"learning_rate": 1.292929292929293e-05, |
|
"loss": 2.4898, |
|
"mean_token_accuracy": 0.8742183148860931, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.984, |
|
"grad_norm": 6.077236652374268, |
|
"learning_rate": 1.2828282828282829e-05, |
|
"loss": 2.5998, |
|
"mean_token_accuracy": 0.8597747683525085, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 5.781510829925537, |
|
"learning_rate": 1.2727272727272727e-05, |
|
"loss": 2.5208, |
|
"mean_token_accuracy": 0.8634484708309174, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 6.007801532745361, |
|
"learning_rate": 1.2626262626262628e-05, |
|
"loss": 2.5073, |
|
"mean_token_accuracy": 0.8571873605251312, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.008, |
|
"grad_norm": 5.24554443359375, |
|
"learning_rate": 1.2525252525252526e-05, |
|
"loss": 2.189, |
|
"mean_token_accuracy": 0.8963351398706436, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.016, |
|
"grad_norm": 5.7106614112854, |
|
"learning_rate": 1.2424242424242424e-05, |
|
"loss": 2.1298, |
|
"mean_token_accuracy": 0.8783598244190216, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.024, |
|
"grad_norm": 5.2301025390625, |
|
"learning_rate": 1.2323232323232325e-05, |
|
"loss": 1.9574, |
|
"mean_token_accuracy": 0.8882095962762833, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.032, |
|
"grad_norm": 5.160372257232666, |
|
"learning_rate": 1.2222222222222222e-05, |
|
"loss": 2.315, |
|
"mean_token_accuracy": 0.8809339702129364, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 5.574481010437012, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 2.1501, |
|
"mean_token_accuracy": 0.8889680802822113, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.048, |
|
"grad_norm": 6.913398265838623, |
|
"learning_rate": 1.202020202020202e-05, |
|
"loss": 2.3129, |
|
"mean_token_accuracy": 0.8849718272686005, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.056, |
|
"grad_norm": 6.229403972625732, |
|
"learning_rate": 1.1919191919191921e-05, |
|
"loss": 2.3757, |
|
"mean_token_accuracy": 0.8662046045064926, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.064, |
|
"grad_norm": 5.60806131362915, |
|
"learning_rate": 1.1818181818181819e-05, |
|
"loss": 2.6174, |
|
"mean_token_accuracy": 0.8517654091119766, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.072, |
|
"grad_norm": 5.600075721740723, |
|
"learning_rate": 1.1717171717171718e-05, |
|
"loss": 2.3305, |
|
"mean_token_accuracy": 0.8705442994832993, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 6.062666416168213, |
|
"learning_rate": 1.1616161616161616e-05, |
|
"loss": 2.2109, |
|
"mean_token_accuracy": 0.8755911886692047, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.088, |
|
"grad_norm": 7.154458999633789, |
|
"learning_rate": 1.1515151515151517e-05, |
|
"loss": 2.6082, |
|
"mean_token_accuracy": 0.860401377081871, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.096, |
|
"grad_norm": 6.247750759124756, |
|
"learning_rate": 1.1414141414141415e-05, |
|
"loss": 2.2877, |
|
"mean_token_accuracy": 0.880204901099205, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.104, |
|
"grad_norm": 5.955714702606201, |
|
"learning_rate": 1.1313131313131314e-05, |
|
"loss": 1.8738, |
|
"mean_token_accuracy": 0.9012208431959152, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.112, |
|
"grad_norm": 5.796663284301758, |
|
"learning_rate": 1.1212121212121212e-05, |
|
"loss": 1.6587, |
|
"mean_token_accuracy": 0.9029347151517868, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 5.392340660095215, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 2.3056, |
|
"mean_token_accuracy": 0.8753447830677032, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.128, |
|
"grad_norm": 7.139640808105469, |
|
"learning_rate": 1.1010101010101011e-05, |
|
"loss": 2.8439, |
|
"mean_token_accuracy": 0.8585825264453888, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.136, |
|
"grad_norm": 5.9656829833984375, |
|
"learning_rate": 1.0909090909090909e-05, |
|
"loss": 2.1093, |
|
"mean_token_accuracy": 0.8788597285747528, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.144, |
|
"grad_norm": 5.839116096496582, |
|
"learning_rate": 1.0808080808080808e-05, |
|
"loss": 2.2813, |
|
"mean_token_accuracy": 0.885636180639267, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.152, |
|
"grad_norm": 5.032593727111816, |
|
"learning_rate": 1.0707070707070708e-05, |
|
"loss": 1.9325, |
|
"mean_token_accuracy": 0.8939605355262756, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 6.447021961212158, |
|
"learning_rate": 1.0606060606060607e-05, |
|
"loss": 2.0899, |
|
"mean_token_accuracy": 0.8907236605882645, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.168, |
|
"grad_norm": 6.839473724365234, |
|
"learning_rate": 1.0505050505050505e-05, |
|
"loss": 1.8443, |
|
"mean_token_accuracy": 0.8959701657295227, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.176, |
|
"grad_norm": 6.076663017272949, |
|
"learning_rate": 1.0404040404040405e-05, |
|
"loss": 2.0503, |
|
"mean_token_accuracy": 0.8750788569450378, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.184, |
|
"grad_norm": 5.917912483215332, |
|
"learning_rate": 1.0303030303030304e-05, |
|
"loss": 2.1561, |
|
"mean_token_accuracy": 0.8795887529850006, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.192, |
|
"grad_norm": 6.047354698181152, |
|
"learning_rate": 1.0202020202020204e-05, |
|
"loss": 2.4048, |
|
"mean_token_accuracy": 0.8771449476480484, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 6.2761125564575195, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 2.0975, |
|
"mean_token_accuracy": 0.8842770159244537, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.208, |
|
"grad_norm": 6.07623291015625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8668, |
|
"mean_token_accuracy": 0.8863209933042526, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.216, |
|
"grad_norm": 6.246818542480469, |
|
"learning_rate": 9.898989898989899e-06, |
|
"loss": 2.245, |
|
"mean_token_accuracy": 0.8780923783779144, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.224, |
|
"grad_norm": 6.323997974395752, |
|
"learning_rate": 9.7979797979798e-06, |
|
"loss": 2.1369, |
|
"mean_token_accuracy": 0.8779752850532532, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.232, |
|
"grad_norm": 7.1778411865234375, |
|
"learning_rate": 9.696969696969698e-06, |
|
"loss": 2.5985, |
|
"mean_token_accuracy": 0.8683747202157974, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 6.361847877502441, |
|
"learning_rate": 9.595959595959595e-06, |
|
"loss": 2.3375, |
|
"mean_token_accuracy": 0.8720128089189529, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.248, |
|
"grad_norm": 6.138617992401123, |
|
"learning_rate": 9.494949494949495e-06, |
|
"loss": 2.0752, |
|
"mean_token_accuracy": 0.8752965927124023, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.2560000000000002, |
|
"grad_norm": 5.688358783721924, |
|
"learning_rate": 9.393939393939394e-06, |
|
"loss": 1.9569, |
|
"mean_token_accuracy": 0.8975227028131485, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.2640000000000002, |
|
"grad_norm": 5.894192218780518, |
|
"learning_rate": 9.292929292929294e-06, |
|
"loss": 2.0029, |
|
"mean_token_accuracy": 0.886854737997055, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.2720000000000002, |
|
"grad_norm": 6.036264419555664, |
|
"learning_rate": 9.191919191919192e-06, |
|
"loss": 2.2071, |
|
"mean_token_accuracy": 0.8780567795038223, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 5.998513221740723, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.9301, |
|
"mean_token_accuracy": 0.8917412608861923, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.288, |
|
"grad_norm": 6.909607887268066, |
|
"learning_rate": 8.98989898989899e-06, |
|
"loss": 2.1684, |
|
"mean_token_accuracy": 0.871549054980278, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.296, |
|
"grad_norm": 6.868885040283203, |
|
"learning_rate": 8.88888888888889e-06, |
|
"loss": 2.3904, |
|
"mean_token_accuracy": 0.8834272921085358, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.304, |
|
"grad_norm": 6.619902610778809, |
|
"learning_rate": 8.787878787878788e-06, |
|
"loss": 2.3145, |
|
"mean_token_accuracy": 0.8664613366127014, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.312, |
|
"grad_norm": 7.675636291503906, |
|
"learning_rate": 8.686868686868687e-06, |
|
"loss": 2.4144, |
|
"mean_token_accuracy": 0.8673148602247238, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 6.839119911193848, |
|
"learning_rate": 8.585858585858587e-06, |
|
"loss": 2.445, |
|
"mean_token_accuracy": 0.8668957501649857, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.328, |
|
"grad_norm": 6.37800407409668, |
|
"learning_rate": 8.484848484848486e-06, |
|
"loss": 2.0239, |
|
"mean_token_accuracy": 0.8818509876728058, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.336, |
|
"grad_norm": 6.840362071990967, |
|
"learning_rate": 8.383838383838384e-06, |
|
"loss": 2.2702, |
|
"mean_token_accuracy": 0.8701023757457733, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.344, |
|
"grad_norm": 6.7885003089904785, |
|
"learning_rate": 8.282828282828283e-06, |
|
"loss": 2.4609, |
|
"mean_token_accuracy": 0.8644936680793762, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.352, |
|
"grad_norm": 7.857728481292725, |
|
"learning_rate": 8.181818181818183e-06, |
|
"loss": 1.8772, |
|
"mean_token_accuracy": 0.8961075842380524, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 6.570309162139893, |
|
"learning_rate": 8.080808080808082e-06, |
|
"loss": 1.8877, |
|
"mean_token_accuracy": 0.8875200748443604, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.368, |
|
"grad_norm": 6.203190803527832, |
|
"learning_rate": 7.97979797979798e-06, |
|
"loss": 2.529, |
|
"mean_token_accuracy": 0.8654111176729202, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.376, |
|
"grad_norm": 5.97314977645874, |
|
"learning_rate": 7.878787878787878e-06, |
|
"loss": 1.9645, |
|
"mean_token_accuracy": 0.8833965063095093, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.384, |
|
"grad_norm": 6.25310754776001, |
|
"learning_rate": 7.777777777777777e-06, |
|
"loss": 2.2233, |
|
"mean_token_accuracy": 0.8773595839738846, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.392, |
|
"grad_norm": 7.28585958480835, |
|
"learning_rate": 7.676767676767677e-06, |
|
"loss": 2.0955, |
|
"mean_token_accuracy": 0.8743046373128891, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 5.4623260498046875, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 2.0769, |
|
"mean_token_accuracy": 0.886017695069313, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.408, |
|
"grad_norm": 6.525033950805664, |
|
"learning_rate": 7.474747474747475e-06, |
|
"loss": 1.8359, |
|
"mean_token_accuracy": 0.8860061317682266, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.416, |
|
"grad_norm": 6.582717418670654, |
|
"learning_rate": 7.3737373737373745e-06, |
|
"loss": 1.9729, |
|
"mean_token_accuracy": 0.8987778276205063, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.424, |
|
"grad_norm": 6.132490634918213, |
|
"learning_rate": 7.272727272727272e-06, |
|
"loss": 1.9637, |
|
"mean_token_accuracy": 0.883312463760376, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.432, |
|
"grad_norm": 5.695188045501709, |
|
"learning_rate": 7.171717171717173e-06, |
|
"loss": 2.0213, |
|
"mean_token_accuracy": 0.8891171365976334, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 8.365019798278809, |
|
"learning_rate": 7.0707070707070704e-06, |
|
"loss": 2.4396, |
|
"mean_token_accuracy": 0.8708128333091736, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.448, |
|
"grad_norm": 5.691287040710449, |
|
"learning_rate": 6.969696969696971e-06, |
|
"loss": 2.0038, |
|
"mean_token_accuracy": 0.9007010161876678, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.456, |
|
"grad_norm": 6.807380676269531, |
|
"learning_rate": 6.8686868686868685e-06, |
|
"loss": 1.9772, |
|
"mean_token_accuracy": 0.8897037208080292, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.464, |
|
"grad_norm": 6.571257591247559, |
|
"learning_rate": 6.767676767676769e-06, |
|
"loss": 2.5581, |
|
"mean_token_accuracy": 0.8635726571083069, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.472, |
|
"grad_norm": 5.491265296936035, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.9038, |
|
"mean_token_accuracy": 0.9004794210195541, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 5.837332725524902, |
|
"learning_rate": 6.565656565656567e-06, |
|
"loss": 1.8962, |
|
"mean_token_accuracy": 0.890899047255516, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.488, |
|
"grad_norm": 6.126434803009033, |
|
"learning_rate": 6.464646464646465e-06, |
|
"loss": 2.0859, |
|
"mean_token_accuracy": 0.8814673125743866, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.496, |
|
"grad_norm": 5.65269660949707, |
|
"learning_rate": 6.363636363636363e-06, |
|
"loss": 1.7333, |
|
"mean_token_accuracy": 0.900140568614006, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.504, |
|
"grad_norm": 6.0961079597473145, |
|
"learning_rate": 6.262626262626263e-06, |
|
"loss": 1.9624, |
|
"mean_token_accuracy": 0.8910833448171616, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.512, |
|
"grad_norm": 6.404435157775879, |
|
"learning_rate": 6.161616161616162e-06, |
|
"loss": 2.3729, |
|
"mean_token_accuracy": 0.8687343299388885, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 6.7053093910217285, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 2.2333, |
|
"mean_token_accuracy": 0.8874952048063278, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.528, |
|
"grad_norm": 6.296670436859131, |
|
"learning_rate": 5.9595959595959605e-06, |
|
"loss": 2.1395, |
|
"mean_token_accuracy": 0.8732293993234634, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.536, |
|
"grad_norm": 6.392675399780273, |
|
"learning_rate": 5.858585858585859e-06, |
|
"loss": 2.0524, |
|
"mean_token_accuracy": 0.8836774080991745, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.544, |
|
"grad_norm": 6.301563262939453, |
|
"learning_rate": 5.7575757575757586e-06, |
|
"loss": 1.8739, |
|
"mean_token_accuracy": 0.8968744426965714, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.552, |
|
"grad_norm": 6.225803852081299, |
|
"learning_rate": 5.656565656565657e-06, |
|
"loss": 2.0003, |
|
"mean_token_accuracy": 0.8827953040599823, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 6.4414801597595215, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 2.1538, |
|
"mean_token_accuracy": 0.8831271827220917, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.568, |
|
"grad_norm": 8.136292457580566, |
|
"learning_rate": 5.4545454545454545e-06, |
|
"loss": 1.7717, |
|
"mean_token_accuracy": 0.8966427743434906, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.576, |
|
"grad_norm": 6.468795299530029, |
|
"learning_rate": 5.353535353535354e-06, |
|
"loss": 1.9399, |
|
"mean_token_accuracy": 0.8909775465726852, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.584, |
|
"grad_norm": 6.0381083488464355, |
|
"learning_rate": 5.2525252525252526e-06, |
|
"loss": 2.0466, |
|
"mean_token_accuracy": 0.8934204578399658, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.592, |
|
"grad_norm": 6.083271026611328, |
|
"learning_rate": 5.151515151515152e-06, |
|
"loss": 2.1554, |
|
"mean_token_accuracy": 0.88754041492939, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 6.717824935913086, |
|
"learning_rate": 5.050505050505051e-06, |
|
"loss": 2.1326, |
|
"mean_token_accuracy": 0.8911218792200089, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.608, |
|
"grad_norm": 6.14173698425293, |
|
"learning_rate": 4.949494949494949e-06, |
|
"loss": 1.6302, |
|
"mean_token_accuracy": 0.9065367430448532, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.616, |
|
"grad_norm": 6.943065643310547, |
|
"learning_rate": 4.848484848484849e-06, |
|
"loss": 2.6283, |
|
"mean_token_accuracy": 0.8691755533218384, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.624, |
|
"grad_norm": 6.845998287200928, |
|
"learning_rate": 4.747474747474747e-06, |
|
"loss": 2.1922, |
|
"mean_token_accuracy": 0.8834270387887955, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.632, |
|
"grad_norm": 6.045809268951416, |
|
"learning_rate": 4.646464646464647e-06, |
|
"loss": 1.8713, |
|
"mean_token_accuracy": 0.9000080078840256, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 7.274755477905273, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 2.0781, |
|
"mean_token_accuracy": 0.8945260941982269, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.648, |
|
"grad_norm": 6.384909152984619, |
|
"learning_rate": 4.444444444444445e-06, |
|
"loss": 2.1529, |
|
"mean_token_accuracy": 0.8765928000211716, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.656, |
|
"grad_norm": 5.835945129394531, |
|
"learning_rate": 4.343434343434344e-06, |
|
"loss": 1.626, |
|
"mean_token_accuracy": 0.9043529778718948, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.664, |
|
"grad_norm": 6.979379177093506, |
|
"learning_rate": 4.242424242424243e-06, |
|
"loss": 2.3105, |
|
"mean_token_accuracy": 0.8783639371395111, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.672, |
|
"grad_norm": 7.117347240447998, |
|
"learning_rate": 4.141414141414142e-06, |
|
"loss": 2.1868, |
|
"mean_token_accuracy": 0.884697675704956, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 6.831372261047363, |
|
"learning_rate": 4.040404040404041e-06, |
|
"loss": 2.1852, |
|
"mean_token_accuracy": 0.8815398663282394, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.6879999999999997, |
|
"grad_norm": 6.203273773193359, |
|
"learning_rate": 3.939393939393939e-06, |
|
"loss": 2.1329, |
|
"mean_token_accuracy": 0.8797174096107483, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.6959999999999997, |
|
"grad_norm": 5.709746837615967, |
|
"learning_rate": 3.8383838383838385e-06, |
|
"loss": 2.0121, |
|
"mean_token_accuracy": 0.8906229883432388, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.7039999999999997, |
|
"grad_norm": 6.700322151184082, |
|
"learning_rate": 3.7373737373737375e-06, |
|
"loss": 2.2537, |
|
"mean_token_accuracy": 0.8837107121944427, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.7119999999999997, |
|
"grad_norm": 5.748446464538574, |
|
"learning_rate": 3.636363636363636e-06, |
|
"loss": 2.199, |
|
"mean_token_accuracy": 0.8685683310031891, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 6.6395697593688965, |
|
"learning_rate": 3.5353535353535352e-06, |
|
"loss": 2.4631, |
|
"mean_token_accuracy": 0.8770331889390945, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.7279999999999998, |
|
"grad_norm": 6.521454334259033, |
|
"learning_rate": 3.4343434343434343e-06, |
|
"loss": 2.7614, |
|
"mean_token_accuracy": 0.8615328371524811, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.7359999999999998, |
|
"grad_norm": 5.956634998321533, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 2.2834, |
|
"mean_token_accuracy": 0.8763253539800644, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.7439999999999998, |
|
"grad_norm": 7.035250186920166, |
|
"learning_rate": 3.2323232323232324e-06, |
|
"loss": 2.033, |
|
"mean_token_accuracy": 0.8936846852302551, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.752, |
|
"grad_norm": 6.001895904541016, |
|
"learning_rate": 3.1313131313131314e-06, |
|
"loss": 2.005, |
|
"mean_token_accuracy": 0.8896369785070419, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 6.189558506011963, |
|
"learning_rate": 3.0303030303030305e-06, |
|
"loss": 2.1698, |
|
"mean_token_accuracy": 0.8766046166419983, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.768, |
|
"grad_norm": 6.994729995727539, |
|
"learning_rate": 2.9292929292929295e-06, |
|
"loss": 2.108, |
|
"mean_token_accuracy": 0.8722548186779022, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.776, |
|
"grad_norm": 5.766332626342773, |
|
"learning_rate": 2.8282828282828286e-06, |
|
"loss": 1.7626, |
|
"mean_token_accuracy": 0.8968187123537064, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.784, |
|
"grad_norm": 6.6420392990112305, |
|
"learning_rate": 2.7272727272727272e-06, |
|
"loss": 2.1263, |
|
"mean_token_accuracy": 0.8799799233675003, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.792, |
|
"grad_norm": 6.921932220458984, |
|
"learning_rate": 2.6262626262626263e-06, |
|
"loss": 1.8222, |
|
"mean_token_accuracy": 0.905672699213028, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 6.345583915710449, |
|
"learning_rate": 2.5252525252525253e-06, |
|
"loss": 2.3984, |
|
"mean_token_accuracy": 0.8654076457023621, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.808, |
|
"grad_norm": 6.0304484367370605, |
|
"learning_rate": 2.4242424242424244e-06, |
|
"loss": 2.5167, |
|
"mean_token_accuracy": 0.8724007159471512, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.816, |
|
"grad_norm": 6.487928867340088, |
|
"learning_rate": 2.3232323232323234e-06, |
|
"loss": 2.5901, |
|
"mean_token_accuracy": 0.8569000661373138, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.824, |
|
"grad_norm": 6.210375785827637, |
|
"learning_rate": 2.2222222222222225e-06, |
|
"loss": 1.7313, |
|
"mean_token_accuracy": 0.8982816338539124, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.832, |
|
"grad_norm": 6.7894182205200195, |
|
"learning_rate": 2.1212121212121216e-06, |
|
"loss": 2.121, |
|
"mean_token_accuracy": 0.8915885388851166, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 6.732741355895996, |
|
"learning_rate": 2.0202020202020206e-06, |
|
"loss": 2.0609, |
|
"mean_token_accuracy": 0.8849688917398453, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.848, |
|
"grad_norm": 6.280490875244141, |
|
"learning_rate": 1.9191919191919192e-06, |
|
"loss": 2.0463, |
|
"mean_token_accuracy": 0.8867028504610062, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.856, |
|
"grad_norm": 6.721777439117432, |
|
"learning_rate": 1.818181818181818e-06, |
|
"loss": 2.2023, |
|
"mean_token_accuracy": 0.8840513229370117, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.864, |
|
"grad_norm": 6.34207010269165, |
|
"learning_rate": 1.7171717171717171e-06, |
|
"loss": 2.2238, |
|
"mean_token_accuracy": 0.8773442953824997, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.872, |
|
"grad_norm": 6.753549098968506, |
|
"learning_rate": 1.6161616161616162e-06, |
|
"loss": 2.1247, |
|
"mean_token_accuracy": 0.8759674429893494, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 6.267600059509277, |
|
"learning_rate": 1.5151515151515152e-06, |
|
"loss": 2.1506, |
|
"mean_token_accuracy": 0.8819513469934464, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.888, |
|
"grad_norm": 6.509019374847412, |
|
"learning_rate": 1.4141414141414143e-06, |
|
"loss": 2.6317, |
|
"mean_token_accuracy": 0.8672285228967667, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.896, |
|
"grad_norm": 6.893660068511963, |
|
"learning_rate": 1.3131313131313131e-06, |
|
"loss": 2.1853, |
|
"mean_token_accuracy": 0.8704394996166229, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.904, |
|
"grad_norm": 6.314718723297119, |
|
"learning_rate": 1.2121212121212122e-06, |
|
"loss": 1.7886, |
|
"mean_token_accuracy": 0.8912533521652222, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.912, |
|
"grad_norm": 7.181535243988037, |
|
"learning_rate": 1.1111111111111112e-06, |
|
"loss": 2.2651, |
|
"mean_token_accuracy": 0.8664899617433548, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 6.317063331604004, |
|
"learning_rate": 1.0101010101010103e-06, |
|
"loss": 2.3394, |
|
"mean_token_accuracy": 0.8771510571241379, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.928, |
|
"grad_norm": 6.195638656616211, |
|
"learning_rate": 9.09090909090909e-07, |
|
"loss": 2.0807, |
|
"mean_token_accuracy": 0.8755189925432205, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.936, |
|
"grad_norm": 6.2105817794799805, |
|
"learning_rate": 8.080808080808081e-07, |
|
"loss": 1.7974, |
|
"mean_token_accuracy": 0.9031495600938797, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.944, |
|
"grad_norm": 6.697377681732178, |
|
"learning_rate": 7.070707070707071e-07, |
|
"loss": 2.0917, |
|
"mean_token_accuracy": 0.8885557353496552, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.952, |
|
"grad_norm": 6.320727825164795, |
|
"learning_rate": 6.060606060606061e-07, |
|
"loss": 2.0905, |
|
"mean_token_accuracy": 0.8876091539859772, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 5.805496692657471, |
|
"learning_rate": 5.050505050505052e-07, |
|
"loss": 2.1719, |
|
"mean_token_accuracy": 0.8817218542098999, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.968, |
|
"grad_norm": 6.263917446136475, |
|
"learning_rate": 4.0404040404040405e-07, |
|
"loss": 1.7863, |
|
"mean_token_accuracy": 0.8985198885202408, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.976, |
|
"grad_norm": 5.801756858825684, |
|
"learning_rate": 3.0303030303030305e-07, |
|
"loss": 2.0907, |
|
"mean_token_accuracy": 0.8869260847568512, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.984, |
|
"grad_norm": 7.1110310554504395, |
|
"learning_rate": 2.0202020202020202e-07, |
|
"loss": 2.3715, |
|
"mean_token_accuracy": 0.8689504116773605, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.992, |
|
"grad_norm": 7.1602911949157715, |
|
"learning_rate": 1.0101010101010101e-07, |
|
"loss": 2.4322, |
|
"mean_token_accuracy": 0.8692635595798492, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 6.362170696258545, |
|
"learning_rate": 0.0, |
|
"loss": 1.9344, |
|
"mean_token_accuracy": 0.8930138498544693, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7732474675200000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |