{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7629947544110635,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0019074868860276585, "grad_norm": 23.25, "learning_rate": 1.9987282207808727e-05, "loss": 1.5688, "mean_token_accuracy": 0.5786232218146324, "step": 5},
    {"epoch": 0.003814973772055317, "grad_norm": 15.0625, "learning_rate": 1.997456441561745e-05, "loss": 1.4629, "mean_token_accuracy": 0.6007462203502655, "step": 10},
    {"epoch": 0.005722460658082976, "grad_norm": 15.0625, "learning_rate": 1.9961846623426175e-05, "loss": 1.4357, "mean_token_accuracy": 0.6233335413038731, "step": 15},
    {"epoch": 0.007629947544110634, "grad_norm": 17.75, "learning_rate": 1.9949128831234897e-05, "loss": 1.4754, "mean_token_accuracy": 0.6211968064308167, "step": 20},
    {"epoch": 0.009537434430138292, "grad_norm": 14.25, "learning_rate": 1.9936411039043622e-05, "loss": 1.3715, "mean_token_accuracy": 0.6358444899320602, "step": 25},
    {"epoch": 0.011444921316165951, "grad_norm": 12.875, "learning_rate": 1.9923693246852348e-05, "loss": 1.366, "mean_token_accuracy": 0.6299699194729328, "step": 30},
    {"epoch": 0.01335240820219361, "grad_norm": 14.25, "learning_rate": 1.9910975454661073e-05, "loss": 1.3211, "mean_token_accuracy": 0.6434777334332467, "step": 35},
    {"epoch": 0.015259895088221268, "grad_norm": 14.75, "learning_rate": 1.98982576624698e-05, "loss": 1.3934, "mean_token_accuracy": 0.6441360361874103, "step": 40},
    {"epoch": 0.017167381974248927, "grad_norm": 14.25, "learning_rate": 1.988553987027852e-05, "loss": 1.3203, "mean_token_accuracy": 0.6629756145179272, "step": 45},
    {"epoch": 0.019074868860276584, "grad_norm": 13.0625, "learning_rate": 1.9872822078087246e-05, "loss": 1.3518, "mean_token_accuracy": 0.623377176374197, "step": 50},
    {"epoch": 0.020982355746304245, "grad_norm": 12.875, "learning_rate": 1.986010428589597e-05, "loss": 1.3845, "mean_token_accuracy": 0.623815793544054, "step": 55},
    {"epoch": 0.022889842632331903, "grad_norm": 15.5625, "learning_rate": 1.9847386493704694e-05, "loss": 1.264, "mean_token_accuracy": 0.6513356983661651, "step": 60},
    {"epoch": 0.02479732951835956, "grad_norm": 15.0, "learning_rate": 1.983466870151342e-05, "loss": 1.2541, "mean_token_accuracy": 0.6393217265605926, "step": 65},
    {"epoch": 0.02670481640438722, "grad_norm": 14.625, "learning_rate": 1.982195090932214e-05, "loss": 1.3066, "mean_token_accuracy": 0.6521439477801323, "step": 70},
    {"epoch": 0.02861230329041488, "grad_norm": 14.125, "learning_rate": 1.9809233117130867e-05, "loss": 1.3278, "mean_token_accuracy": 0.6283318139612675, "step": 75},
    {"epoch": 0.030519790176442536, "grad_norm": 13.125, "learning_rate": 1.9796515324939593e-05, "loss": 1.3905, "mean_token_accuracy": 0.6241413362324237, "step": 80},
    {"epoch": 0.03242727706247019, "grad_norm": 16.375, "learning_rate": 1.9783797532748318e-05, "loss": 1.3635, "mean_token_accuracy": 0.6261423952877522, "step": 85},
    {"epoch": 0.034334763948497854, "grad_norm": 15.125, "learning_rate": 1.977107974055704e-05, "loss": 1.2991, "mean_token_accuracy": 0.6268114626407624, "step": 90},
    {"epoch": 0.036242250834525515, "grad_norm": 13.75, "learning_rate": 1.9758361948365766e-05, "loss": 1.3033, "mean_token_accuracy": 0.6382434226572513, "step": 95},
    {"epoch": 0.03814973772055317, "grad_norm": 14.4375, "learning_rate": 1.974564415617449e-05, "loss": 1.3264, "mean_token_accuracy": 0.6418077692389488, "step": 100},
    {"epoch": 0.04005722460658083, "grad_norm": 13.625, "learning_rate": 1.9732926363983213e-05, "loss": 1.3281, "mean_token_accuracy": 0.6261951096355916, "step": 105},
    {"epoch": 0.04196471149260849, "grad_norm": 12.4375, "learning_rate": 1.972020857179194e-05, "loss": 1.3703, "mean_token_accuracy": 0.6287036083638669, "step": 110},
    {"epoch": 0.043872198378636144, "grad_norm": 14.3125, "learning_rate": 1.970749077960066e-05, "loss": 1.258, "mean_token_accuracy": 0.6674343384802341, "step": 115},
    {"epoch": 0.045779685264663805, "grad_norm": 12.375, "learning_rate": 1.9694772987409387e-05, "loss": 1.2793, "mean_token_accuracy": 0.6343503654003143, "step": 120},
    {"epoch": 0.047687172150691466, "grad_norm": 13.75, "learning_rate": 1.9682055195218112e-05, "loss": 1.2583, "mean_token_accuracy": 0.6366156145930291, "step": 125},
    {"epoch": 0.04959465903671912, "grad_norm": 11.6875, "learning_rate": 1.9669337403026834e-05, "loss": 1.2335, "mean_token_accuracy": 0.6459783628582955, "step": 130},
    {"epoch": 0.05150214592274678, "grad_norm": 14.5, "learning_rate": 1.965661961083556e-05, "loss": 1.2978, "mean_token_accuracy": 0.6397392503917217, "step": 135},
    {"epoch": 0.05340963280877444, "grad_norm": 15.0625, "learning_rate": 1.9643901818644285e-05, "loss": 1.407, "mean_token_accuracy": 0.6118167527019978, "step": 140},
    {"epoch": 0.055317119694802096, "grad_norm": 12.6875, "learning_rate": 1.963118402645301e-05, "loss": 1.2377, "mean_token_accuracy": 0.6603214554488659, "step": 145},
    {"epoch": 0.05722460658082976, "grad_norm": 14.1875, "learning_rate": 1.9618466234261733e-05, "loss": 1.2917, "mean_token_accuracy": 0.6472210057079792, "step": 150},
    {"epoch": 0.05913209346685742, "grad_norm": 13.5, "learning_rate": 1.9605748442070458e-05, "loss": 1.2779, "mean_token_accuracy": 0.657531713694334, "step": 155},
    {"epoch": 0.06103958035288507, "grad_norm": 14.5, "learning_rate": 1.9593030649879184e-05, "loss": 1.3979, "mean_token_accuracy": 0.6118378482758999, "step": 160},
    {"epoch": 0.06294706723891273, "grad_norm": 13.625, "learning_rate": 1.9580312857687906e-05, "loss": 1.3048, "mean_token_accuracy": 0.6219266936182976, "step": 165},
    {"epoch": 0.06485455412494039, "grad_norm": 12.125, "learning_rate": 1.956759506549663e-05, "loss": 1.3086, "mean_token_accuracy": 0.6731891065835953, "step": 170},
    {"epoch": 0.06676204101096805, "grad_norm": 13.0, "learning_rate": 1.9554877273305353e-05, "loss": 1.292, "mean_token_accuracy": 0.6188046306371688, "step": 175},
    {"epoch": 0.06866952789699571, "grad_norm": 13.1875, "learning_rate": 1.954215948111408e-05, "loss": 1.2729, "mean_token_accuracy": 0.6499899953603745, "step": 180},
    {"epoch": 0.07057701478302336, "grad_norm": 16.0, "learning_rate": 1.9529441688922804e-05, "loss": 1.306, "mean_token_accuracy": 0.6558002933859826, "step": 185},
    {"epoch": 0.07248450166905103, "grad_norm": 12.125, "learning_rate": 1.951672389673153e-05, "loss": 1.3232, "mean_token_accuracy": 0.6388124503195286, "step": 190},
    {"epoch": 0.07439198855507868, "grad_norm": 12.4375, "learning_rate": 1.9504006104540255e-05, "loss": 1.22, "mean_token_accuracy": 0.6645313084125519, "step": 195},
    {"epoch": 0.07629947544110634, "grad_norm": 12.1875, "learning_rate": 1.9491288312348978e-05, "loss": 1.3315, "mean_token_accuracy": 0.6429102905094624, "step": 200},
    {"epoch": 0.078206962327134, "grad_norm": 11.875, "learning_rate": 1.9478570520157703e-05, "loss": 1.2854, "mean_token_accuracy": 0.6392446413636208, "step": 205},
    {"epoch": 0.08011444921316166, "grad_norm": 13.625, "learning_rate": 1.9465852727966425e-05, "loss": 1.3131, "mean_token_accuracy": 0.6423079304397106, "step": 210},
    {"epoch": 0.08202193609918931, "grad_norm": 13.5, "learning_rate": 1.945313493577515e-05, "loss": 1.343, "mean_token_accuracy": 0.6454595476388931, "step": 215},
    {"epoch": 0.08392942298521698, "grad_norm": 16.75, "learning_rate": 1.9440417143583876e-05, "loss": 1.2251, "mean_token_accuracy": 0.6403672114014626, "step": 220},
    {"epoch": 0.08583690987124463, "grad_norm": 12.75, "learning_rate": 1.9427699351392598e-05, "loss": 1.3682, "mean_token_accuracy": 0.6467891149222851, "step": 225},
    {"epoch": 0.08774439675727229, "grad_norm": 14.625, "learning_rate": 1.9414981559201324e-05, "loss": 1.3697, "mean_token_accuracy": 0.6114679872989655, "step": 230},
    {"epoch": 0.08965188364329996, "grad_norm": 14.625, "learning_rate": 1.940226376701005e-05, "loss": 1.3222, "mean_token_accuracy": 0.6396318718791008, "step": 235},
    {"epoch": 0.09155937052932761, "grad_norm": 12.1875, "learning_rate": 1.938954597481877e-05, "loss": 1.2226, "mean_token_accuracy": 0.6718489579856396, "step": 240},
    {"epoch": 0.09346685741535526, "grad_norm": 12.5, "learning_rate": 1.9376828182627497e-05, "loss": 1.3327, "mean_token_accuracy": 0.6280755452811718, "step": 245},
    {"epoch": 0.09537434430138293, "grad_norm": 14.0, "learning_rate": 1.9364110390436222e-05, "loss": 1.3255, "mean_token_accuracy": 0.6481422707438469, "step": 250},
    {"epoch": 0.09728183118741059, "grad_norm": 12.5, "learning_rate": 1.9351392598244948e-05, "loss": 1.2724, "mean_token_accuracy": 0.6506570190191269, "step": 255},
    {"epoch": 0.09918931807343824, "grad_norm": 12.75, "learning_rate": 1.933867480605367e-05, "loss": 1.199, "mean_token_accuracy": 0.6626401949673891, "step": 260},
    {"epoch": 0.10109680495946591, "grad_norm": 14.25, "learning_rate": 1.9325957013862396e-05, "loss": 1.3612, "mean_token_accuracy": 0.6227854460477829, "step": 265},
    {"epoch": 0.10300429184549356, "grad_norm": 13.3125, "learning_rate": 1.9313239221671118e-05, "loss": 1.3032, "mean_token_accuracy": 0.6405224844813346, "step": 270},
    {"epoch": 0.10491177873152122, "grad_norm": 13.5, "learning_rate": 1.9300521429479843e-05, "loss": 1.2347, "mean_token_accuracy": 0.6489498361945152, "step": 275},
    {"epoch": 0.10681926561754888, "grad_norm": 14.375, "learning_rate": 1.928780363728857e-05, "loss": 1.2301, "mean_token_accuracy": 0.6662219226360321, "step": 280},
    {"epoch": 0.10872675250357654, "grad_norm": 16.375, "learning_rate": 1.927508584509729e-05, "loss": 1.3138, "mean_token_accuracy": 0.6375241696834564, "step": 285},
    {"epoch": 0.11063423938960419, "grad_norm": 11.5625, "learning_rate": 1.9262368052906016e-05, "loss": 1.3117, "mean_token_accuracy": 0.6601852528750897, "step": 290},
    {"epoch": 0.11254172627563186, "grad_norm": 11.0, "learning_rate": 1.9249650260714742e-05, "loss": 1.2498, "mean_token_accuracy": 0.6576425515115261, "step": 295},
    {"epoch": 0.11444921316165951, "grad_norm": 14.625, "learning_rate": 1.9236932468523467e-05, "loss": 1.2453, "mean_token_accuracy": 0.6428922191262245, "step": 300},
    {"epoch": 0.11635670004768717, "grad_norm": 11.1875, "learning_rate": 1.9224214676332193e-05, "loss": 1.1744, "mean_token_accuracy": 0.673005498945713, "step": 305},
    {"epoch": 0.11826418693371483, "grad_norm": 13.4375, "learning_rate": 1.9211496884140915e-05, "loss": 1.2113, "mean_token_accuracy": 0.6825620189309121, "step": 310},
    {"epoch": 0.12017167381974249, "grad_norm": 15.0625, "learning_rate": 1.919877909194964e-05, "loss": 1.1482, "mean_token_accuracy": 0.6523631751537323, "step": 315},
    {"epoch": 0.12207916070577014, "grad_norm": 12.5, "learning_rate": 1.9186061299758362e-05, "loss": 1.2886, "mean_token_accuracy": 0.6448531322181225, "step": 320},
    {"epoch": 0.12398664759179781, "grad_norm": 10.9375, "learning_rate": 1.9173343507567088e-05, "loss": 1.273, "mean_token_accuracy": 0.6480912029743194, "step": 325},
    {"epoch": 0.12589413447782546, "grad_norm": 13.25, "learning_rate": 1.916062571537581e-05, "loss": 1.363, "mean_token_accuracy": 0.6523602977395058, "step": 330},
    {"epoch": 0.12780162136385312, "grad_norm": 16.0, "learning_rate": 1.9147907923184536e-05, "loss": 1.2477, "mean_token_accuracy": 0.6430979609489441, "step": 335},
    {"epoch": 0.12970910824988077, "grad_norm": 12.8125, "learning_rate": 1.913519013099326e-05, "loss": 1.1793, "mean_token_accuracy": 0.6552196487784385, "step": 340},
    {"epoch": 0.13161659513590845, "grad_norm": 15.25, "learning_rate": 1.9122472338801987e-05, "loss": 1.2032, "mean_token_accuracy": 0.6600343361496925, "step": 345},
    {"epoch": 0.1335240820219361, "grad_norm": 14.125, "learning_rate": 1.9109754546610712e-05, "loss": 1.3241, "mean_token_accuracy": 0.6307924754917622, "step": 350},
    {"epoch": 0.13543156890796376, "grad_norm": 18.875, "learning_rate": 1.9097036754419434e-05, "loss": 1.2946, "mean_token_accuracy": 0.6583775602281093, "step": 355},
    {"epoch": 0.13733905579399142, "grad_norm": 14.6875, "learning_rate": 1.908431896222816e-05, "loss": 1.3141, "mean_token_accuracy": 0.6407140150666237, "step": 360},
    {"epoch": 0.13924654268001907, "grad_norm": 12.75, "learning_rate": 1.9071601170036885e-05, "loss": 1.3233, "mean_token_accuracy": 0.6416009657084942, "step": 365},
    {"epoch": 0.14115402956604672, "grad_norm": 13.3125, "learning_rate": 1.9058883377845607e-05, "loss": 1.1646, "mean_token_accuracy": 0.6636185184121132, "step": 370},
    {"epoch": 0.1430615164520744, "grad_norm": 11.875, "learning_rate": 1.9046165585654333e-05, "loss": 1.3162, "mean_token_accuracy": 0.6456755690276623, "step": 375},
    {"epoch": 0.14496900333810206, "grad_norm": 11.5625, "learning_rate": 1.9033447793463055e-05, "loss": 1.2697, "mean_token_accuracy": 0.6551977261900902, "step": 380},
    {"epoch": 0.1468764902241297, "grad_norm": 14.6875, "learning_rate": 1.902073000127178e-05, "loss": 1.2712, "mean_token_accuracy": 0.6613594964146614, "step": 385},
    {"epoch": 0.14878397711015737, "grad_norm": 15.375, "learning_rate": 1.9008012209080503e-05, "loss": 1.1864, "mean_token_accuracy": 0.6685264468193054, "step": 390},
    {"epoch": 0.15069146399618502, "grad_norm": 18.375, "learning_rate": 1.8995294416889228e-05, "loss": 1.3601, "mean_token_accuracy": 0.6272764652967453, "step": 395},
    {"epoch": 0.15259895088221268, "grad_norm": 11.9375, "learning_rate": 1.8982576624697954e-05, "loss": 1.3435, "mean_token_accuracy": 0.6371615134179592, "step": 400},
    {"epoch": 0.15450643776824036, "grad_norm": 13.25, "learning_rate": 1.896985883250668e-05, "loss": 1.3172, "mean_token_accuracy": 0.6476671509444714, "step": 405},
    {"epoch": 0.156413924654268, "grad_norm": 11.75, "learning_rate": 1.8957141040315405e-05, "loss": 1.204, "mean_token_accuracy": 0.658133564144373, "step": 410},
    {"epoch": 0.15832141154029566, "grad_norm": 13.6875, "learning_rate": 1.8944423248124127e-05, "loss": 1.3395, "mean_token_accuracy": 0.6478057160973549, "step": 415},
    {"epoch": 0.16022889842632332, "grad_norm": 12.9375, "learning_rate": 1.8931705455932852e-05, "loss": 1.2254, "mean_token_accuracy": 0.655465979874134, "step": 420},
    {"epoch": 0.16213638531235097, "grad_norm": 14.25, "learning_rate": 1.8918987663741578e-05, "loss": 1.3114, "mean_token_accuracy": 0.6299714539200068, "step": 425},
    {"epoch": 0.16404387219837863, "grad_norm": 12.625, "learning_rate": 1.89062698715503e-05, "loss": 1.2496, "mean_token_accuracy": 0.6370454408228398, "step": 430},
    {"epoch": 0.1659513590844063, "grad_norm": 14.25, "learning_rate": 1.8893552079359025e-05, "loss": 1.2617, "mean_token_accuracy": 0.6422188803553581, "step": 435},
    {"epoch": 0.16785884597043396, "grad_norm": 13.25, "learning_rate": 1.8880834287167747e-05, "loss": 1.3337, "mean_token_accuracy": 0.6599226906895638, "step": 440},
    {"epoch": 0.16976633285646162, "grad_norm": 11.8125, "learning_rate": 1.8868116494976473e-05, "loss": 1.2401, "mean_token_accuracy": 0.6384981378912926, "step": 445},
    {"epoch": 0.17167381974248927, "grad_norm": 15.1875, "learning_rate": 1.88553987027852e-05, "loss": 1.3429, "mean_token_accuracy": 0.6290579028427601, "step": 450},
    {"epoch": 0.17358130662851692, "grad_norm": 14.625, "learning_rate": 1.8842680910593924e-05, "loss": 1.26, "mean_token_accuracy": 0.6353619039058686, "step": 455},
    {"epoch": 0.17548879351454458, "grad_norm": 11.9375, "learning_rate": 1.882996311840265e-05, "loss": 1.2258, "mean_token_accuracy": 0.6596131481230258, "step": 460},
    {"epoch": 0.17739628040057226, "grad_norm": 12.0, "learning_rate": 1.881724532621137e-05, "loss": 1.3572, "mean_token_accuracy": 0.6373824059963227, "step": 465},
    {"epoch": 0.1793037672865999, "grad_norm": 13.0625, "learning_rate": 1.8804527534020097e-05, "loss": 1.2938, "mean_token_accuracy": 0.6647212252020835, "step": 470},
    {"epoch": 0.18121125417262757, "grad_norm": 14.0625, "learning_rate": 1.879180974182882e-05, "loss": 1.2844, "mean_token_accuracy": 0.6200387306511402, "step": 475},
    {"epoch": 0.18311874105865522, "grad_norm": 12.5625, "learning_rate": 1.8779091949637545e-05, "loss": 1.2827, "mean_token_accuracy": 0.6431592881679535, "step": 480},
    {"epoch": 0.18502622794468288, "grad_norm": 13.1875, "learning_rate": 1.876637415744627e-05, "loss": 1.2237, "mean_token_accuracy": 0.667415677011013, "step": 485},
    {"epoch": 0.18693371483071053, "grad_norm": 13.5, "learning_rate": 1.8753656365254992e-05, "loss": 1.376, "mean_token_accuracy": 0.6330628030002117, "step": 490},
    {"epoch": 0.1888412017167382, "grad_norm": 13.5, "learning_rate": 1.8740938573063718e-05, "loss": 1.3092, "mean_token_accuracy": 0.6471766166388988, "step": 495},
    {"epoch": 0.19074868860276586, "grad_norm": 11.125, "learning_rate": 1.872822078087244e-05, "loss": 1.1906, "mean_token_accuracy": 0.6722134962677956, "step": 500},
    {"epoch": 0.19265617548879352, "grad_norm": 14.875, "learning_rate": 1.8715502988681165e-05, "loss": 1.3313, "mean_token_accuracy": 0.6335577562451362, "step": 505},
    {"epoch": 0.19456366237482117, "grad_norm": 13.9375, "learning_rate": 1.870278519648989e-05, "loss": 1.2115, "mean_token_accuracy": 0.6548780754208565, "step": 510},
    {"epoch": 0.19647114926084883, "grad_norm": 11.5, "learning_rate": 1.8690067404298616e-05, "loss": 1.3229, "mean_token_accuracy": 0.6398176200687885, "step": 515},
    {"epoch": 0.19837863614687648, "grad_norm": 12.875, "learning_rate": 1.8677349612107342e-05, "loss": 1.2795, "mean_token_accuracy": 0.6450474753975868, "step": 520},
    {"epoch": 0.20028612303290416, "grad_norm": 11.9375, "learning_rate": 1.8664631819916064e-05, "loss": 1.3635, "mean_token_accuracy": 0.6569343976676464, "step": 525},
    {"epoch": 0.20219360991893182, "grad_norm": 13.5, "learning_rate": 1.865191402772479e-05, "loss": 1.2394, "mean_token_accuracy": 0.6203682988882064, "step": 530},
    {"epoch": 0.20410109680495947, "grad_norm": 12.3125, "learning_rate": 1.863919623553351e-05, "loss": 1.2598, "mean_token_accuracy": 0.6244672931730747, "step": 535},
    {"epoch": 0.20600858369098712, "grad_norm": 11.75, "learning_rate": 1.8626478443342237e-05, "loss": 1.2364, "mean_token_accuracy": 0.6448377221822739, "step": 540},
    {"epoch": 0.20791607057701478, "grad_norm": 13.5625, "learning_rate": 1.8613760651150963e-05, "loss": 1.3184, "mean_token_accuracy": 0.6233433380722999, "step": 545},
    {"epoch": 0.20982355746304243, "grad_norm": 11.125, "learning_rate": 1.8601042858959685e-05, "loss": 1.1649, "mean_token_accuracy": 0.6621956214308738, "step": 550},
    {"epoch": 0.2117310443490701, "grad_norm": 10.8125, "learning_rate": 1.858832506676841e-05, "loss": 1.2191, "mean_token_accuracy": 0.6589386224746704, "step": 555},
    {"epoch": 0.21363853123509777, "grad_norm": 11.6875, "learning_rate": 1.8575607274577136e-05, "loss": 1.1812, "mean_token_accuracy": 0.6640269085764885, "step": 560},
    {"epoch": 0.21554601812112542, "grad_norm": 13.9375, "learning_rate": 1.856288948238586e-05, "loss": 1.3898, "mean_token_accuracy": 0.6331407696008682, "step": 565},
    {"epoch": 0.21745350500715308, "grad_norm": 10.875, "learning_rate": 1.8550171690194583e-05, "loss": 1.1636, "mean_token_accuracy": 0.6876399561762809, "step": 570},
    {"epoch": 0.21936099189318073, "grad_norm": 13.5, "learning_rate": 1.853745389800331e-05, "loss": 1.1624, "mean_token_accuracy": 0.683189244568348, "step": 575},
    {"epoch": 0.22126847877920838, "grad_norm": 12.625, "learning_rate": 1.8524736105812034e-05, "loss": 1.1666, "mean_token_accuracy": 0.6829775631427765, "step": 580},
    {"epoch": 0.22317596566523606, "grad_norm": 17.625, "learning_rate": 1.8512018313620756e-05, "loss": 1.3125, "mean_token_accuracy": 0.6385770931839942, "step": 585},
    {"epoch": 0.22508345255126372, "grad_norm": 12.5625, "learning_rate": 1.8499300521429482e-05, "loss": 1.2159, "mean_token_accuracy": 0.6778845816850663, "step": 590},
    {"epoch": 0.22699093943729137, "grad_norm": 11.3125, "learning_rate": 1.8486582729238204e-05, "loss": 1.1959, "mean_token_accuracy": 0.6777172073721885, "step": 595},
    {"epoch": 0.22889842632331903, "grad_norm": 13.0625, "learning_rate": 1.847386493704693e-05, "loss": 1.2577, "mean_token_accuracy": 0.6450985379517078, "step": 600},
    {"epoch": 0.23080591320934668, "grad_norm": 12.5, "learning_rate": 1.8461147144855655e-05, "loss": 1.1582, "mean_token_accuracy": 0.6681318923830986, "step": 605},
    {"epoch": 0.23271340009537433, "grad_norm": 19.75, "learning_rate": 1.8448429352664377e-05, "loss": 1.2834, "mean_token_accuracy": 0.6435618877410889, "step": 610},
    {"epoch": 0.23462088698140202, "grad_norm": 11.8125, "learning_rate": 1.8435711560473103e-05, "loss": 1.2699, "mean_token_accuracy": 0.6606324508786201, "step": 615},
    {"epoch": 0.23652837386742967, "grad_norm": 13.625, "learning_rate": 1.8422993768281828e-05, "loss": 1.2522, "mean_token_accuracy": 0.6440395541489125, "step": 620},
    {"epoch": 0.23843586075345732, "grad_norm": 15.4375, "learning_rate": 1.8410275976090554e-05, "loss": 1.2139, "mean_token_accuracy": 0.6698227688670159, "step": 625},
    {"epoch": 0.24034334763948498, "grad_norm": 13.3125, "learning_rate": 1.8397558183899276e-05, "loss": 1.2841, "mean_token_accuracy": 0.6581710763275623, "step": 630},
    {"epoch": 0.24225083452551263, "grad_norm": 12.0625, "learning_rate": 1.8384840391708e-05, "loss": 1.2246, "mean_token_accuracy": 0.6571968667209148, "step": 635},
    {"epoch": 0.24415832141154029, "grad_norm": 11.625, "learning_rate": 1.8372122599516727e-05, "loss": 1.2693, "mean_token_accuracy": 0.6628240890800953, "step": 640},
    {"epoch": 0.24606580829756797, "grad_norm": 12.75, "learning_rate": 1.835940480732545e-05, "loss": 1.2361, "mean_token_accuracy": 0.6445854142308235, "step": 645},
    {"epoch": 0.24797329518359562, "grad_norm": 12.5625, "learning_rate": 1.8346687015134174e-05, "loss": 1.2374, "mean_token_accuracy": 0.6404064007103443, "step": 650},
    {"epoch": 0.24988078206962328, "grad_norm": 13.1875, "learning_rate": 1.8333969222942896e-05, "loss": 1.1423, "mean_token_accuracy": 0.6834298320114612, "step": 655},
    {"epoch": 0.25178826895565093, "grad_norm": 12.625, "learning_rate": 1.8321251430751622e-05, "loss": 1.1964, "mean_token_accuracy": 0.6344361655414105, "step": 660},
    {"epoch": 0.2536957558416786, "grad_norm": 12.375, "learning_rate": 1.8308533638560347e-05, "loss": 1.183, "mean_token_accuracy": 0.6825651362538337, "step": 665},
    {"epoch": 0.25560324272770624, "grad_norm": 11.625, "learning_rate": 1.8295815846369073e-05, "loss": 1.1489, "mean_token_accuracy": 0.6593642435967922, "step": 670},
    {"epoch": 0.2575107296137339, "grad_norm": 17.0, "learning_rate": 1.82830980541778e-05, "loss": 1.3731, "mean_token_accuracy": 0.6474427200853825, "step": 675},
    {"epoch": 0.25941821649976154, "grad_norm": 11.0625, "learning_rate": 1.827038026198652e-05, "loss": 1.2505, "mean_token_accuracy": 0.6625294893980026, "step": 680},
    {"epoch": 0.2613257033857892, "grad_norm": 15.0625, "learning_rate": 1.8257662469795246e-05, "loss": 1.3258, "mean_token_accuracy": 0.6423907749354839, "step": 685},
    {"epoch": 0.2632331902718169, "grad_norm": 11.125, "learning_rate": 1.8244944677603968e-05, "loss": 1.1728, "mean_token_accuracy": 0.6944520533084869, "step": 690},
    {"epoch": 0.26514067715784456, "grad_norm": 15.6875, "learning_rate": 1.8232226885412694e-05, "loss": 1.3058, "mean_token_accuracy": 0.6184810683131218, "step": 695},
    {"epoch": 0.2670481640438722, "grad_norm": 13.5, "learning_rate": 1.821950909322142e-05, "loss": 1.2324, "mean_token_accuracy": 0.6492418631911278, "step": 700},
    {"epoch": 0.26895565092989987, "grad_norm": 13.9375, "learning_rate": 1.820679130103014e-05, "loss": 1.249, "mean_token_accuracy": 0.6514696642756462, "step": 705},
    {"epoch": 0.2708631378159275, "grad_norm": 10.9375, "learning_rate": 1.8194073508838867e-05, "loss": 1.1874, "mean_token_accuracy": 0.6706778854131699, "step": 710},
    {"epoch": 0.2727706247019552, "grad_norm": 11.875, "learning_rate": 1.8181355716647592e-05, "loss": 1.1423, "mean_token_accuracy": 0.6606415674090386, "step": 715},
    {"epoch": 0.27467811158798283, "grad_norm": 11.0, "learning_rate": 1.8168637924456314e-05, "loss": 1.2079, "mean_token_accuracy": 0.6652269288897514, "step": 720},
    {"epoch": 0.2765855984740105, "grad_norm": 12.5, "learning_rate": 1.815592013226504e-05, "loss": 1.1645, "mean_token_accuracy": 0.6542063765227795, "step": 725},
    {"epoch": 0.27849308536003814, "grad_norm": 11.5625, "learning_rate": 1.8143202340073765e-05, "loss": 1.1272, "mean_token_accuracy": 0.6796695061028004, "step": 730},
    {"epoch": 0.2804005722460658, "grad_norm": 13.625, "learning_rate": 1.813048454788249e-05, "loss": 1.2422, "mean_token_accuracy": 0.6447897091507911, "step": 735},
    {"epoch": 0.28230805913209345, "grad_norm": 16.375, "learning_rate": 1.8117766755691213e-05, "loss": 1.2117, "mean_token_accuracy": 0.6523264884948731, "step": 740},
    {"epoch": 0.2842155460181211, "grad_norm": 14.1875, "learning_rate": 1.810504896349994e-05, "loss": 1.2406, "mean_token_accuracy": 0.6362795114517212, "step": 745},
    {"epoch": 0.2861230329041488, "grad_norm": 11.125, "learning_rate": 1.809233117130866e-05, "loss": 1.1137, "mean_token_accuracy": 0.6698605574667453, "step": 750},
    {"epoch": 0.28803051979017646, "grad_norm": 12.125, "learning_rate": 1.8079613379117386e-05, "loss": 1.1863, "mean_token_accuracy": 0.6513051472604274, "step": 755},
    {"epoch": 0.2899380066762041, "grad_norm": 14.3125, "learning_rate": 1.806689558692611e-05, "loss": 1.2335, "mean_token_accuracy": 0.689585417509079, "step": 760},
    {"epoch": 0.2918454935622318, "grad_norm": 12.4375, "learning_rate": 1.8054177794734834e-05, "loss": 1.1806, "mean_token_accuracy": 0.6750295326113701, "step": 765},
    {"epoch": 0.2937529804482594, "grad_norm": 12.375, "learning_rate": 1.804146000254356e-05, "loss": 1.2288, "mean_token_accuracy": 0.6504852309823036, "step": 770},
    {"epoch": 0.2956604673342871, "grad_norm": 13.8125, "learning_rate": 1.8028742210352285e-05, "loss": 1.2207, "mean_token_accuracy": 0.6786825984716416, "step": 775},
    {"epoch": 0.29756795422031473, "grad_norm": 11.5625, "learning_rate": 1.801602441816101e-05, "loss": 1.2771, "mean_token_accuracy": 0.6656466156244278, "step": 780},
    {"epoch": 0.2994754411063424, "grad_norm": 12.5625, "learning_rate": 1.8003306625969736e-05, "loss": 1.2357, "mean_token_accuracy": 0.6720115698873996, "step": 785},
    {"epoch": 0.30138292799237004, "grad_norm": 11.1875, "learning_rate": 1.7990588833778458e-05, "loss": 1.2388, "mean_token_accuracy": 0.6488339100033045, "step": 790},
    {"epoch": 0.3032904148783977, "grad_norm": 12.6875, "learning_rate": 1.7977871041587183e-05, "loss": 1.1258, "mean_token_accuracy": 0.6850755415856838, "step": 795},
    {"epoch": 0.30519790176442535, "grad_norm": 12.0625, "learning_rate": 1.7965153249395905e-05, "loss": 1.2621, "mean_token_accuracy": 0.6553989514708519, "step": 800},
    {"epoch": 0.307105388650453, "grad_norm": 11.6875, "learning_rate": 1.795243545720463e-05, "loss": 1.084, "mean_token_accuracy": 0.6827043130993843, "step": 805},
    {"epoch": 0.3090128755364807, "grad_norm": 13.5625, "learning_rate": 1.7939717665013353e-05, "loss": 1.1707, "mean_token_accuracy": 0.6674947261810302, "step": 810},
    {"epoch": 0.31092036242250837, "grad_norm": 14.125, "learning_rate": 1.792699987282208e-05, "loss": 1.2618, "mean_token_accuracy": 0.6420316450297833, "step": 815},
    {"epoch": 0.312827849308536, "grad_norm": 14.375, "learning_rate": 1.7914282080630804e-05, "loss": 1.2122, "mean_token_accuracy": 0.6519798688590527, "step": 820},
    {"epoch": 0.3147353361945637, "grad_norm": 13.8125, "learning_rate": 1.790156428843953e-05, "loss": 1.3042, "mean_token_accuracy": 0.6403947852551937, "step": 825},
    {"epoch": 0.31664282308059133, "grad_norm": 11.3125, "learning_rate": 1.788884649624825e-05, "loss": 1.2524, "mean_token_accuracy": 0.6245795480906964, "step": 830},
    {"epoch": 0.318550309966619, "grad_norm": 14.4375, "learning_rate": 1.7876128704056977e-05, "loss": 1.2826, "mean_token_accuracy": 0.6210857503116131, "step": 835},
    {"epoch": 0.32045779685264664, "grad_norm": 12.0, "learning_rate": 1.7863410911865703e-05, "loss": 1.2216, "mean_token_accuracy": 0.664474394917488, "step": 840},
    {"epoch": 0.3223652837386743, "grad_norm": 12.5625, "learning_rate": 1.7850693119674428e-05, "loss": 1.1351, "mean_token_accuracy": 0.647052352130413, "step": 845},
    {"epoch": 0.32427277062470194, "grad_norm": 10.8125, "learning_rate": 1.783797532748315e-05,
|
"loss": 1.1048, |
|
"mean_token_accuracy": 0.7011352255940437, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3261802575107296, |
|
"grad_norm": 15.75, |
|
"learning_rate": 1.7825257535291876e-05, |
|
"loss": 1.2485, |
|
"mean_token_accuracy": 0.6584809772670269, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.32808774439675725, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 1.7812539743100598e-05, |
|
"loss": 1.2427, |
|
"mean_token_accuracy": 0.6549804627895355, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3299952312827849, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.7799821950909323e-05, |
|
"loss": 1.0888, |
|
"mean_token_accuracy": 0.690093956142664, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.3319027181688126, |
|
"grad_norm": 15.6875, |
|
"learning_rate": 1.7787104158718045e-05, |
|
"loss": 1.1655, |
|
"mean_token_accuracy": 0.6605223380029202, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.33381020505484027, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 1.777438636652677e-05, |
|
"loss": 1.1255, |
|
"mean_token_accuracy": 0.6727245457470417, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3357176919408679, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.7761668574335496e-05, |
|
"loss": 1.1536, |
|
"mean_token_accuracy": 0.6633619382977486, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3376251788268956, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.7748950782144222e-05, |
|
"loss": 1.1295, |
|
"mean_token_accuracy": 0.6744188234210015, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.33953266571292323, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.7736232989952947e-05, |
|
"loss": 1.1849, |
|
"mean_token_accuracy": 0.6710595056414604, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.3414401525989509, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.772351519776167e-05, |
|
"loss": 1.1048, |
|
"mean_token_accuracy": 0.686171543598175, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.7710797405570395e-05, |
|
"loss": 1.1954, |
|
"mean_token_accuracy": 0.6756417036056519, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3452551263710062, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.7698079613379117e-05, |
|
"loss": 1.1613, |
|
"mean_token_accuracy": 0.6741260103881359, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.34716261325703385, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 1.7685361821187843e-05, |
|
"loss": 1.2515, |
|
"mean_token_accuracy": 0.6590819746255875, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3490701001430615, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.7672644028996568e-05, |
|
"loss": 1.2335, |
|
"mean_token_accuracy": 0.6556867949664593, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.35097758702908916, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.765992623680529e-05, |
|
"loss": 1.3305, |
|
"mean_token_accuracy": 0.6548508778214455, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.3528850739151168, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.7647208444614016e-05, |
|
"loss": 1.1825, |
|
"mean_token_accuracy": 0.6824756883084774, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3547925608011445, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 1.763449065242274e-05, |
|
"loss": 1.2301, |
|
"mean_token_accuracy": 0.6782255977392196, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3567000476871722, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.7621772860231467e-05, |
|
"loss": 1.175, |
|
"mean_token_accuracy": 0.6592901304364205, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.3586075345731998, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.760905506804019e-05, |
|
"loss": 1.1576, |
|
"mean_token_accuracy": 0.6741487175226212, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3605150214592275, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.7596337275848914e-05, |
|
"loss": 1.2168, |
|
"mean_token_accuracy": 0.6609480068087578, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.36242250834525513, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.758361948365764e-05, |
|
"loss": 1.2009, |
|
"mean_token_accuracy": 0.6507039844989777, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3643299952312828, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.7570901691466362e-05, |
|
"loss": 1.1247, |
|
"mean_token_accuracy": 0.6828011214733124, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.36623748211731044, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.7558183899275088e-05, |
|
"loss": 1.2232, |
|
"mean_token_accuracy": 0.6623214483261108, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3681449690033381, |
|
"grad_norm": 16.5, |
|
"learning_rate": 1.754546610708381e-05, |
|
"loss": 1.3721, |
|
"mean_token_accuracy": 0.6213653467595577, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.37005245588936575, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 1.7532748314892535e-05, |
|
"loss": 1.1464, |
|
"mean_token_accuracy": 0.6750129237771034, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.3719599427753934, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.752003052270126e-05, |
|
"loss": 1.1425, |
|
"mean_token_accuracy": 0.6775874204933643, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.37386742966142106, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.7507312730509983e-05, |
|
"loss": 1.2007, |
|
"mean_token_accuracy": 0.6537452466785908, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3757749165474487, |
|
"grad_norm": 14.875, |
|
"learning_rate": 1.7494594938318708e-05, |
|
"loss": 1.2564, |
|
"mean_token_accuracy": 0.6193661518394947, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.3776824034334764, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1.7481877146127434e-05, |
|
"loss": 1.1159, |
|
"mean_token_accuracy": 0.6820607483386993, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3795898903195041, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.746915935393616e-05, |
|
"loss": 1.0954, |
|
"mean_token_accuracy": 0.681636007130146, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.38149737720553173, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.7456441561744885e-05, |
|
"loss": 1.1578, |
|
"mean_token_accuracy": 0.6307322606444359, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3834048640915594, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.7443723769553607e-05, |
|
"loss": 1.1466, |
|
"mean_token_accuracy": 0.6797019310295582, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.38531235097758704, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.7431005977362332e-05, |
|
"loss": 1.2437, |
|
"mean_token_accuracy": 0.665465061366558, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3872198378636147, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 1.7418288185171054e-05, |
|
"loss": 1.1273, |
|
"mean_token_accuracy": 0.6660737812519073, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.38912732474964234, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.740557039297978e-05, |
|
"loss": 1.0377, |
|
"mean_token_accuracy": 0.6913766637444496, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.39103481163567, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.7392852600788502e-05, |
|
"loss": 1.2057, |
|
"mean_token_accuracy": 0.6652260690927505, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.39294229852169765, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.7380134808597228e-05, |
|
"loss": 1.1664, |
|
"mean_token_accuracy": 0.6586895115673542, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3948497854077253, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.7367417016405953e-05, |
|
"loss": 1.2026, |
|
"mean_token_accuracy": 0.6655789151787758, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.39675727229375296, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.735469922421468e-05, |
|
"loss": 1.1472, |
|
"mean_token_accuracy": 0.67775809019804, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.3986647591797806, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.7341981432023404e-05, |
|
"loss": 1.2337, |
|
"mean_token_accuracy": 0.6594470135867596, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.4005722460658083, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.7329263639832126e-05, |
|
"loss": 1.1844, |
|
"mean_token_accuracy": 0.6957078948616982, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.402479732951836, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.731654584764085e-05, |
|
"loss": 1.0933, |
|
"mean_token_accuracy": 0.6744157910346985, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.40438721983786363, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.7303828055449577e-05, |
|
"loss": 1.2524, |
|
"mean_token_accuracy": 0.6677799835801125, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4062947067238913, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.72911102632583e-05, |
|
"loss": 1.0756, |
|
"mean_token_accuracy": 0.689930209517479, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.40820219360991894, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.7278392471067025e-05, |
|
"loss": 1.1762, |
|
"mean_token_accuracy": 0.6775659635663033, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.4101096804959466, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.7265674678875747e-05, |
|
"loss": 1.2004, |
|
"mean_token_accuracy": 0.6647816874086857, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.41201716738197425, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.7252956886684472e-05, |
|
"loss": 1.0648, |
|
"mean_token_accuracy": 0.7046946279704571, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.4139246542680019, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.7240239094493198e-05, |
|
"loss": 1.2307, |
|
"mean_token_accuracy": 0.6585713855922222, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.41583214115402956, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.722752130230192e-05, |
|
"loss": 1.212, |
|
"mean_token_accuracy": 0.6784304775297642, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.4177396280400572, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.7214803510110646e-05, |
|
"loss": 1.1185, |
|
"mean_token_accuracy": 0.690502467751503, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.41964711492608486, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 1.720208571791937e-05, |
|
"loss": 1.2032, |
|
"mean_token_accuracy": 0.643844348937273, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4215546018121125, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 1.7189367925728097e-05, |
|
"loss": 1.1605, |
|
"mean_token_accuracy": 0.6835091434419155, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.4234620886981402, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.717665013353682e-05, |
|
"loss": 1.3624, |
|
"mean_token_accuracy": 0.6280046261847019, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.4253695755841679, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.7163932341345544e-05, |
|
"loss": 1.2297, |
|
"mean_token_accuracy": 0.6916924893856049, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.42727706247019553, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.715121454915427e-05, |
|
"loss": 1.1743, |
|
"mean_token_accuracy": 0.6540060944855213, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.7138496756962992e-05, |
|
"loss": 1.1755, |
|
"mean_token_accuracy": 0.6581062689423561, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.43109203624225084, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.7125778964771717e-05, |
|
"loss": 1.2194, |
|
"mean_token_accuracy": 0.665621517598629, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.4329995231282785, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 1.711306117258044e-05, |
|
"loss": 1.0739, |
|
"mean_token_accuracy": 0.709334583580494, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.43490701001430615, |
|
"grad_norm": 14.125, |
|
"learning_rate": 1.7100343380389165e-05, |
|
"loss": 1.1086, |
|
"mean_token_accuracy": 0.6792764872312546, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.4368144969003338, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.708762558819789e-05, |
|
"loss": 1.263, |
|
"mean_token_accuracy": 0.6668927378952503, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.43872198378636146, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.7074907796006616e-05, |
|
"loss": 1.1677, |
|
"mean_token_accuracy": 0.6571429625153542, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4406294706723891, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.706219000381534e-05, |
|
"loss": 1.102, |
|
"mean_token_accuracy": 0.6752917438745498, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.44253695755841677, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.7049472211624063e-05, |
|
"loss": 1.1673, |
|
"mean_token_accuracy": 0.6898197934031487, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.703675441943279e-05, |
|
"loss": 1.1781, |
|
"mean_token_accuracy": 0.6933842703700066, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.44635193133047213, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.702403662724151e-05, |
|
"loss": 1.1156, |
|
"mean_token_accuracy": 0.6760947778820992, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.4482594182164998, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.7011318835050237e-05, |
|
"loss": 1.1263, |
|
"mean_token_accuracy": 0.657148540019989, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.45016690510252744, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.6998601042858962e-05, |
|
"loss": 1.1793, |
|
"mean_token_accuracy": 0.6318694293498993, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4520743919885551, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.6985883250667684e-05, |
|
"loss": 1.1116, |
|
"mean_token_accuracy": 0.6872119233012199, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.45398187887458274, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.697316545847641e-05, |
|
"loss": 1.2359, |
|
"mean_token_accuracy": 0.6691461108624935, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.4558893657606104, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.6960447666285135e-05, |
|
"loss": 1.0583, |
|
"mean_token_accuracy": 0.6924574553966523, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.45779685264663805, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.6947729874093857e-05, |
|
"loss": 1.1006, |
|
"mean_token_accuracy": 0.6897717788815498, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4597043395326657, |
|
"grad_norm": 17.375, |
|
"learning_rate": 1.6935012081902583e-05, |
|
"loss": 1.2088, |
|
"mean_token_accuracy": 0.6763196066021919, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.46161182641869336, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.6922294289711308e-05, |
|
"loss": 1.2367, |
|
"mean_token_accuracy": 0.6802153252065182, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.463519313304721, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.6909576497520034e-05, |
|
"loss": 1.1484, |
|
"mean_token_accuracy": 0.67167152389884, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.46542680019074867, |
|
"grad_norm": 11.25, |
|
"learning_rate": 1.6896858705328756e-05, |
|
"loss": 1.0818, |
|
"mean_token_accuracy": 0.6954056262969971, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.4673342870767763, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.688414091313748e-05, |
|
"loss": 1.2057, |
|
"mean_token_accuracy": 0.6596958786249161, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.46924177396280403, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.6871423120946204e-05, |
|
"loss": 1.121, |
|
"mean_token_accuracy": 0.6667362228035927, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.4711492608488317, |
|
"grad_norm": 10.75, |
|
"learning_rate": 1.685870532875493e-05, |
|
"loss": 1.2371, |
|
"mean_token_accuracy": 0.6504265196621418, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.47305674773485934, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.6845987536563655e-05, |
|
"loss": 1.2297, |
|
"mean_token_accuracy": 0.6640058435499668, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.474964234620887, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 1.6833269744372377e-05, |
|
"loss": 1.2614, |
|
"mean_token_accuracy": 0.6515400633215904, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.47687172150691465, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.6820551952181102e-05, |
|
"loss": 1.1777, |
|
"mean_token_accuracy": 0.6837553754448891, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4787792083929423, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.6807834159989828e-05, |
|
"loss": 1.2948, |
|
"mean_token_accuracy": 0.6862575516104699, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.48068669527896996, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.6795116367798553e-05, |
|
"loss": 1.0523, |
|
"mean_token_accuracy": 0.707449996471405, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.4825941821649976, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.6782398575607275e-05, |
|
"loss": 1.193, |
|
"mean_token_accuracy": 0.6783672124147415, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.48450166905102526, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.6769680783416e-05, |
|
"loss": 1.2587, |
|
"mean_token_accuracy": 0.6748954504728317, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.4864091559370529, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.6756962991224726e-05, |
|
"loss": 1.1465, |
|
"mean_token_accuracy": 0.6888084650039673, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.48831664282308057, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.674424519903345e-05, |
|
"loss": 1.1009, |
|
"mean_token_accuracy": 0.6846926853060722, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.4902241297091082, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.6731527406842174e-05, |
|
"loss": 1.1359, |
|
"mean_token_accuracy": 0.6763195641338825, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.49213161659513593, |
|
"grad_norm": 15.75, |
|
"learning_rate": 1.6718809614650896e-05, |
|
"loss": 1.0937, |
|
"mean_token_accuracy": 0.6663194999098778, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.4940391034811636, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 1.670609182245962e-05, |
|
"loss": 1.1554, |
|
"mean_token_accuracy": 0.6896326825022697, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.49594659036719124, |
|
"grad_norm": 13.0, |
|
"learning_rate": 1.6693374030268347e-05, |
|
"loss": 1.1328, |
|
"mean_token_accuracy": 0.6875351458787918, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4978540772532189, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.6680656238077072e-05, |
|
"loss": 1.1569, |
|
"mean_token_accuracy": 0.6798019893467426, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.49976156413924655, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.6667938445885795e-05, |
|
"loss": 1.2192, |
|
"mean_token_accuracy": 0.6738032080233097, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5016690510252741, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.665522065369452e-05, |
|
"loss": 1.131, |
|
"mean_token_accuracy": 0.6774978704750538, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.5035765379113019, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.6642502861503246e-05, |
|
"loss": 1.0869, |
|
"mean_token_accuracy": 0.6816208437085152, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.5054840247973296, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.6629785069311968e-05, |
|
"loss": 1.1625, |
|
"mean_token_accuracy": 0.6730111822485924, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.5073915116833572, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.6617067277120693e-05, |
|
"loss": 1.1635, |
|
"mean_token_accuracy": 0.6660488948225975, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.5092989985693849, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.660434948492942e-05, |
|
"loss": 1.1048, |
|
"mean_token_accuracy": 0.6875575929880142, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.5112064854554125, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.659163169273814e-05, |
|
"loss": 1.0917, |
|
"mean_token_accuracy": 0.6965513400733471, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.5131139723414402, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.6578913900546866e-05, |
|
"loss": 1.0532, |
|
"mean_token_accuracy": 0.6765949100255966, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.5150214592274678, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.656619610835559e-05, |
|
"loss": 1.225, |
|
"mean_token_accuracy": 0.6685496076941491, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.5169289461134955, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.6553478316164314e-05, |
|
"loss": 1.2317, |
|
"mean_token_accuracy": 0.6922548785805702, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.5188364329995231, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 1.654076052397304e-05, |
|
"loss": 1.1384, |
|
"mean_token_accuracy": 0.6778303541243076, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5207439198855508, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.6528042731781765e-05, |
|
"loss": 1.0721, |
|
"mean_token_accuracy": 0.6984316289424897, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.5226514067715784, |
|
"grad_norm": 15.125, |
|
"learning_rate": 1.651532493959049e-05, |
|
"loss": 1.1862, |
|
"mean_token_accuracy": 0.6730318419635296, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.5245588936576061, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.6502607147399213e-05, |
|
"loss": 1.0065, |
|
"mean_token_accuracy": 0.7321307882666588, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.5264663805436338, |
|
"grad_norm": 13.0, |
|
"learning_rate": 1.6489889355207938e-05, |
|
"loss": 1.3213, |
|
"mean_token_accuracy": 0.6621546871960163, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5283738674296614, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 1.647717156301666e-05, |
|
"loss": 1.219, |
|
"mean_token_accuracy": 0.6426276586949825, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.5302813543156891, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.6464453770825386e-05, |
|
"loss": 1.0934, |
|
"mean_token_accuracy": 0.6920969501137734, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5321888412017167, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.645173597863411e-05, |
|
"loss": 1.1581, |
|
"mean_token_accuracy": 0.6565943203866482, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.5340963280877444, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.6439018186442833e-05, |
|
"loss": 1.1194, |
|
"mean_token_accuracy": 0.6673160851001739, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.536003814973772, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.642630039425156e-05, |
|
"loss": 1.1693, |
|
"mean_token_accuracy": 0.6715315200388432, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.5379113018597997, |
|
"grad_norm": 15.0, |
|
"learning_rate": 1.6413582602060284e-05, |
|
"loss": 1.1414, |
|
"mean_token_accuracy": 0.6831993453204632, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5398187887458273, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.640086480986901e-05, |
|
"loss": 1.1978, |
|
"mean_token_accuracy": 0.6750304147601127, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.541726275631855, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 1.6388147017677732e-05, |
|
"loss": 1.1056, |
|
"mean_token_accuracy": 0.6595890045166015, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5436337625178826, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.6375429225486457e-05, |
|
"loss": 1.1298, |
|
"mean_token_accuracy": 0.684663999825716, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.5455412494039104, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.6362711433295183e-05, |
|
"loss": 1.0179, |
|
"mean_token_accuracy": 0.7125121988356113, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.547448736289938, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.6349993641103905e-05, |
|
"loss": 1.1698, |
|
"mean_token_accuracy": 0.6814395613968373, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.5493562231759657, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.633727584891263e-05, |
|
"loss": 1.2112, |
|
"mean_token_accuracy": 0.6521120138466359, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.5512637100619934, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.6324558056721353e-05, |
|
"loss": 1.0544, |
|
"mean_token_accuracy": 0.7097316659986973, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.553171196948021, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.6311840264530078e-05, |
|
"loss": 1.1943, |
|
"mean_token_accuracy": 0.6261555813252926, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5550786838340487, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.6299122472338804e-05, |
|
"loss": 1.2382, |
|
"mean_token_accuracy": 0.6481642045080662, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.5569861707200763, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.6286404680147526e-05, |
|
"loss": 1.1301, |
|
"mean_token_accuracy": 0.6791691981256008, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.558893657606104, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.627368688795625e-05, |
|
"loss": 1.1533, |
|
"mean_token_accuracy": 0.6759828574955463, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.5608011444921316, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 1.6260969095764977e-05, |
|
"loss": 1.1125, |
|
"mean_token_accuracy": 0.6778176836669445, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.5627086313781593, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.6248251303573702e-05, |
|
"loss": 1.194, |
|
"mean_token_accuracy": 0.6579376481473446, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.5646161182641869, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.6235533511382428e-05, |
|
"loss": 1.1965, |
|
"mean_token_accuracy": 0.6491686496883631, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5665236051502146, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 1.622281571919115e-05, |
|
"loss": 1.1953, |
|
"mean_token_accuracy": 0.6784386828541755, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.5684310920362422, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.6210097926999875e-05, |
|
"loss": 1.2534, |
|
"mean_token_accuracy": 0.663827870041132, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5703385789222699, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.6197380134808597e-05, |
|
"loss": 1.163, |
|
"mean_token_accuracy": 0.6678325220942497, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.5722460658082976, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.6184662342617323e-05, |
|
"loss": 1.1818, |
|
"mean_token_accuracy": 0.6880650117993354, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5741535526943252, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.6171944550426045e-05, |
|
"loss": 1.1857, |
|
"mean_token_accuracy": 0.6938965663313865, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.5760610395803529, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.615922675823477e-05, |
|
"loss": 1.1482, |
|
"mean_token_accuracy": 0.696337477862835, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5779685264663805, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.6146508966043496e-05, |
|
"loss": 1.112, |
|
"mean_token_accuracy": 0.6995782241225242, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.5798760133524082, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.613379117385222e-05, |
|
"loss": 1.157, |
|
"mean_token_accuracy": 0.6632382079958916, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5817835002384358, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.6121073381660947e-05, |
|
"loss": 1.1297, |
|
"mean_token_accuracy": 0.6936305865645409, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5836909871244635, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.610835558946967e-05, |
|
"loss": 1.182, |
|
"mean_token_accuracy": 0.6679765857756138, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.5855984740104911, |
|
"grad_norm": 15.25, |
|
"learning_rate": 1.6095637797278395e-05, |
|
"loss": 1.1584, |
|
"mean_token_accuracy": 0.6810820989310742, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.5875059608965189, |
|
"grad_norm": 13.625, |
|
"learning_rate": 1.608292000508712e-05, |
|
"loss": 1.173, |
|
"mean_token_accuracy": 0.6857012517750263, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5894134477825465, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 1.6070202212895842e-05, |
|
"loss": 1.1198, |
|
"mean_token_accuracy": 0.6993751585483551, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.5913209346685742, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.6057484420704568e-05, |
|
"loss": 1.1397, |
|
"mean_token_accuracy": 0.682830485701561, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5932284215546018, |
|
"grad_norm": 15.0, |
|
"learning_rate": 1.604476662851329e-05, |
|
"loss": 1.1759, |
|
"mean_token_accuracy": 0.6623894415795804, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.5951359084406295, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.6032048836322015e-05, |
|
"loss": 1.1408, |
|
"mean_token_accuracy": 0.658209715038538, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5970433953266572, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.601933104413074e-05, |
|
"loss": 1.1133, |
|
"mean_token_accuracy": 0.7014020837843418, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.5989508822126848, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.6006613251939463e-05, |
|
"loss": 1.1147, |
|
"mean_token_accuracy": 0.6807858303189278, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.6008583690987125, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.599389545974819e-05, |
|
"loss": 1.1151, |
|
"mean_token_accuracy": 0.6718579567968845, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.6027658559847401, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.5981177667556914e-05, |
|
"loss": 1.0995, |
|
"mean_token_accuracy": 0.6849846243858337, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6046733428707678, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.596845987536564e-05, |
|
"loss": 1.2112, |
|
"mean_token_accuracy": 0.6757240429520607, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.6065808297567954, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.595574208317436e-05, |
|
"loss": 1.1318, |
|
"mean_token_accuracy": 0.6792904518544673, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6084883166428231, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.5943024290983087e-05, |
|
"loss": 1.1354, |
|
"mean_token_accuracy": 0.6702633060514926, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.6103958035288507, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.5930306498791813e-05, |
|
"loss": 1.0983, |
|
"mean_token_accuracy": 0.6991154357790947, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6123032904148784, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.5917588706600535e-05, |
|
"loss": 1.0745, |
|
"mean_token_accuracy": 0.681308564543724, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.614210777300906, |
|
"grad_norm": 15.1875, |
|
"learning_rate": 1.590487091440926e-05, |
|
"loss": 1.0767, |
|
"mean_token_accuracy": 0.7030037745833397, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6161182641869337, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1.5892153122217982e-05, |
|
"loss": 1.1064, |
|
"mean_token_accuracy": 0.6708254374563694, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.6180257510729614, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.5879435330026708e-05, |
|
"loss": 1.13, |
|
"mean_token_accuracy": 0.6800018325448036, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.619933237958989, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.5866717537835433e-05, |
|
"loss": 1.0084, |
|
"mean_token_accuracy": 0.7097429156303405, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.6218407248450167, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.585399974564416e-05, |
|
"loss": 1.062, |
|
"mean_token_accuracy": 0.7025215804576874, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6237482117310443, |
|
"grad_norm": 14.75, |
|
"learning_rate": 1.5841281953452884e-05, |
|
"loss": 1.1089, |
|
"mean_token_accuracy": 0.6919012367725372, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.625655698617072, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.5828564161261606e-05, |
|
"loss": 1.1117, |
|
"mean_token_accuracy": 0.6917063646018505, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6275631855030996, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.5815846369070332e-05, |
|
"loss": 1.1046, |
|
"mean_token_accuracy": 0.6739561937749385, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.6294706723891274, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.5803128576879054e-05, |
|
"loss": 1.1264, |
|
"mean_token_accuracy": 0.6709033444523811, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.631378159275155, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.579041078468778e-05, |
|
"loss": 1.2637, |
|
"mean_token_accuracy": 0.669117308408022, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.6332856461611827, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 1.5777692992496505e-05, |
|
"loss": 1.1087, |
|
"mean_token_accuracy": 0.6606097273528576, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6351931330472103, |
|
"grad_norm": 10.75, |
|
"learning_rate": 1.5764975200305227e-05, |
|
"loss": 1.107, |
|
"mean_token_accuracy": 0.676217395812273, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.637100619933238, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 1.5752257408113953e-05, |
|
"loss": 1.153, |
|
"mean_token_accuracy": 0.6523927465081215, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6390081068192656, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 1.5739539615922678e-05, |
|
"loss": 1.2302, |
|
"mean_token_accuracy": 0.6501938365399837, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6409155937052933, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.57268218237314e-05, |
|
"loss": 1.0566, |
|
"mean_token_accuracy": 0.6942887641489506, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.642823080591321, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.5714104031540126e-05, |
|
"loss": 1.224, |
|
"mean_token_accuracy": 0.6493762314319611, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.6447305674773486, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.570138623934885e-05, |
|
"loss": 1.0819, |
|
"mean_token_accuracy": 0.6909515604376792, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6466380543633763, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.5688668447157577e-05, |
|
"loss": 1.0433, |
|
"mean_token_accuracy": 0.6917465507984162, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.6485455412494039, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.56759506549663e-05, |
|
"loss": 1.1417, |
|
"mean_token_accuracy": 0.6675561875104904, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6504530281354316, |
|
"grad_norm": 15.875, |
|
"learning_rate": 1.5663232862775024e-05, |
|
"loss": 1.1317, |
|
"mean_token_accuracy": 0.6959464140236378, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.6523605150214592, |
|
"grad_norm": 14.375, |
|
"learning_rate": 1.5650515070583746e-05, |
|
"loss": 1.0547, |
|
"mean_token_accuracy": 0.6967534452676774, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6542680019074869, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.5637797278392472e-05, |
|
"loss": 1.2029, |
|
"mean_token_accuracy": 0.6825399219989776, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.6561754887935145, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.5625079486201197e-05, |
|
"loss": 1.2765, |
|
"mean_token_accuracy": 0.6506742417812348, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6580829756795422, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.561236169400992e-05, |
|
"loss": 1.1471, |
|
"mean_token_accuracy": 0.6708492331206799, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.6599904625655698, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 1.5599643901818645e-05, |
|
"loss": 0.9993, |
|
"mean_token_accuracy": 0.7227230161428452, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6618979494515975, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.558692610962737e-05, |
|
"loss": 1.155, |
|
"mean_token_accuracy": 0.6793848961591721, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.6638054363376252, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.5574208317436096e-05, |
|
"loss": 1.1064, |
|
"mean_token_accuracy": 0.6699606157839298, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6657129232236528, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.5561490525244818e-05, |
|
"loss": 1.0567, |
|
"mean_token_accuracy": 0.704989293217659, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.6676204101096805, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.5548772733053544e-05, |
|
"loss": 1.0546, |
|
"mean_token_accuracy": 0.6963534206151962, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6695278969957081, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.553605494086227e-05, |
|
"loss": 1.1009, |
|
"mean_token_accuracy": 0.6913698375225067, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.6714353838817358, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.552333714867099e-05, |
|
"loss": 1.1649, |
|
"mean_token_accuracy": 0.6772311896085739, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6733428707677634, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.5510619356479717e-05, |
|
"loss": 1.0516, |
|
"mean_token_accuracy": 0.6774869538843632, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.6752503576537912, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.549790156428844e-05, |
|
"loss": 1.1433, |
|
"mean_token_accuracy": 0.6827952593564988, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.6771578445398188, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.5485183772097164e-05, |
|
"loss": 1.2089, |
|
"mean_token_accuracy": 0.6664685860276223, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6790653314258465, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.547246597990589e-05, |
|
"loss": 1.108, |
|
"mean_token_accuracy": 0.6785644575953483, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6809728183118741, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.5459748187714615e-05, |
|
"loss": 1.123, |
|
"mean_token_accuracy": 0.6967614680528641, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.6828803051979018, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 1.5447030395523338e-05, |
|
"loss": 1.0844, |
|
"mean_token_accuracy": 0.6894414715468884, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6847877920839294, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.5434312603332063e-05, |
|
"loss": 1.211, |
|
"mean_token_accuracy": 0.6685944899916649, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.542159481114079e-05, |
|
"loss": 1.1606, |
|
"mean_token_accuracy": 0.6708570115268231, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6886027658559848, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.540887701894951e-05, |
|
"loss": 1.1955, |
|
"mean_token_accuracy": 0.6612754940986634, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.6905102527420124, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 1.5396159226758236e-05, |
|
"loss": 1.1045, |
|
"mean_token_accuracy": 0.7275897234678268, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6924177396280401, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.538344143456696e-05, |
|
"loss": 1.1093, |
|
"mean_token_accuracy": 0.7059011451900006, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.6943252265140677, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.5370723642375684e-05, |
|
"loss": 1.0552, |
|
"mean_token_accuracy": 0.7078164145350456, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6962327134000954, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.535800585018441e-05, |
|
"loss": 1.1243, |
|
"mean_token_accuracy": 0.6750999264419079, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.698140200286123, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.534528805799313e-05, |
|
"loss": 1.0626, |
|
"mean_token_accuracy": 0.6938546419143676, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7000476871721507, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.5332570265801857e-05, |
|
"loss": 1.0889, |
|
"mean_token_accuracy": 0.6872342020273209, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.7019551740581783, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.5319852473610582e-05, |
|
"loss": 1.0794, |
|
"mean_token_accuracy": 0.7006407782435418, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.703862660944206, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.5307134681419308e-05, |
|
"loss": 1.1733, |
|
"mean_token_accuracy": 0.6737750858068466, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.7057701478302336, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.5294416889228033e-05, |
|
"loss": 1.0611, |
|
"mean_token_accuracy": 0.7101491972804069, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7076776347162613, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.5281699097036755e-05, |
|
"loss": 1.197, |
|
"mean_token_accuracy": 0.6738182365894317, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.709585121602289, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.526898130484548e-05, |
|
"loss": 0.9995, |
|
"mean_token_accuracy": 0.7093352362513542, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7114926084883166, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.5256263512654203e-05, |
|
"loss": 1.1021, |
|
"mean_token_accuracy": 0.7152903437614441, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.7134000953743443, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.5243545720462929e-05, |
|
"loss": 1.1298, |
|
"mean_token_accuracy": 0.6734328977763653, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7153075822603719, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 1.5230827928271654e-05, |
|
"loss": 1.0921, |
|
"mean_token_accuracy": 0.6812200739979744, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.7172150691463997, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.5218110136080378e-05, |
|
"loss": 1.0641, |
|
"mean_token_accuracy": 0.6840920761227608, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7191225560324273, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.5205392343889103e-05, |
|
"loss": 1.1002, |
|
"mean_token_accuracy": 0.6968449607491494, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.721030042918455, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.5192674551697825e-05, |
|
"loss": 1.1009, |
|
"mean_token_accuracy": 0.6776654615998268, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7229375298044826, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.5179956759506551e-05, |
|
"loss": 1.1619, |
|
"mean_token_accuracy": 0.6722353339195252, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.7248450166905103, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 1.5167238967315275e-05, |
|
"loss": 1.0884, |
|
"mean_token_accuracy": 0.7083397641777992, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7267525035765379, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.5154521175124e-05, |
|
"loss": 1.1197, |
|
"mean_token_accuracy": 0.7005624413490296, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.7286599904625656, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.5141803382932724e-05, |
|
"loss": 1.1168, |
|
"mean_token_accuracy": 0.6706736356019973, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7305674773485932, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.5129085590741448e-05, |
|
"loss": 1.1169, |
|
"mean_token_accuracy": 0.6706511251628399, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.7324749642346209, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.5116367798550173e-05, |
|
"loss": 1.0641, |
|
"mean_token_accuracy": 0.7128054112195968, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7343824511206486, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.5103650006358897e-05, |
|
"loss": 1.0722, |
|
"mean_token_accuracy": 0.6964303895831108, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.7362899380066762, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.5090932214167621e-05, |
|
"loss": 1.1099, |
|
"mean_token_accuracy": 0.670535609871149, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7381974248927039, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.5078214421976347e-05, |
|
"loss": 1.1376, |
|
"mean_token_accuracy": 0.6716626837849617, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.7401049117787315, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.506549662978507e-05, |
|
"loss": 1.1237, |
|
"mean_token_accuracy": 0.6802854061126709, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7420123986647592, |
|
"grad_norm": 11.5, |
|
"learning_rate": 1.5052778837593796e-05, |
|
"loss": 1.1543, |
|
"mean_token_accuracy": 0.6751244008541107, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.7439198855507868, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.5040061045402518e-05, |
|
"loss": 1.0762, |
|
"mean_token_accuracy": 0.682145906984806, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7458273724368145, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.5027343253211243e-05, |
|
"loss": 1.1921, |
|
"mean_token_accuracy": 0.6897265024483203, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.7477348593228421, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.5014625461019967e-05, |
|
"loss": 1.1296, |
|
"mean_token_accuracy": 0.6912665694952012, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7496423462088698, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.5001907668828693e-05, |
|
"loss": 0.997, |
|
"mean_token_accuracy": 0.7142476089298725, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.7515498330948974, |
|
"grad_norm": 15.375, |
|
"learning_rate": 1.4989189876637418e-05, |
|
"loss": 1.105, |
|
"mean_token_accuracy": 0.6805791586637497, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7534573199809251, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.497647208444614e-05, |
|
"loss": 1.1086, |
|
"mean_token_accuracy": 0.6618591591715812, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.7553648068669528, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.4963754292254866e-05, |
|
"loss": 1.2405, |
|
"mean_token_accuracy": 0.6381066031754017, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7572722937529804, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.495103650006359e-05, |
|
"loss": 1.0646, |
|
"mean_token_accuracy": 0.6994791410863399, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.7591797806390082, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 1.4938318707872315e-05, |
|
"loss": 1.0498, |
|
"mean_token_accuracy": 0.721826684474945, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7610872675250357, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.492560091568104e-05, |
|
"loss": 1.0933, |
|
"mean_token_accuracy": 0.677222141623497, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.7629947544110635, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.4912883123489763e-05, |
|
"loss": 1.0058, |
|
"mean_token_accuracy": 0.7074526056647301, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 7863, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.3261800470707405e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|