|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998234442495793, |
|
"eval_steps": 1000, |
|
"global_step": 5309, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009416306689108865, |
|
"grad_norm": 13.400167465209961, |
|
"learning_rate": 1e-05, |
|
"loss": 4.0168, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.001883261337821773, |
|
"grad_norm": 12.05469036102295, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4972, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0028248920067326594, |
|
"grad_norm": 11.709904670715332, |
|
"learning_rate": 1e-05, |
|
"loss": 3.5447, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.003766522675643546, |
|
"grad_norm": 10.864925384521484, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4348, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.004708153344554432, |
|
"grad_norm": 12.948342323303223, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4288, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.005649784013465319, |
|
"grad_norm": 10.90282917022705, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4199, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006591414682376205, |
|
"grad_norm": 10.574660301208496, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4564, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.007533045351287092, |
|
"grad_norm": 12.737661361694336, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4071, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.008474676020197977, |
|
"grad_norm": 11.787413597106934, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3169, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.009416306689108865, |
|
"grad_norm": 11.487077713012695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4083, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01035793735801975, |
|
"grad_norm": 10.931989669799805, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3384, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.011299568026930638, |
|
"grad_norm": 11.073975563049316, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2959, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.012241198695841523, |
|
"grad_norm": 10.520795822143555, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3513, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.01318282936475241, |
|
"grad_norm": 10.708564758300781, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1843, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.014124460033663296, |
|
"grad_norm": 9.278353691101074, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2907, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.015066090702574184, |
|
"grad_norm": 9.356634140014648, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3033, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01600772137148507, |
|
"grad_norm": 10.317462921142578, |
|
"learning_rate": 1e-05, |
|
"loss": 3.402, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.016949352040395955, |
|
"grad_norm": 9.032744407653809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3137, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.017890982709306842, |
|
"grad_norm": 9.985954284667969, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3074, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.01883261337821773, |
|
"grad_norm": 9.465397834777832, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3329, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.019774244047128613, |
|
"grad_norm": 9.615800857543945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3839, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0207158747160395, |
|
"grad_norm": 9.558786392211914, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3383, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.021657505384950388, |
|
"grad_norm": 10.330078125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2845, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.022599136053861275, |
|
"grad_norm": 9.462120056152344, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2918, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02354076672277216, |
|
"grad_norm": 10.415687561035156, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3509, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.024482397391683047, |
|
"grad_norm": 9.891234397888184, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3196, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.025424028060593934, |
|
"grad_norm": 10.6549072265625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2702, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02636565872950482, |
|
"grad_norm": 10.514317512512207, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3639, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.027307289398415705, |
|
"grad_norm": 10.233137130737305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3693, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.028248920067326592, |
|
"grad_norm": 9.689544677734375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2754, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02919055073623748, |
|
"grad_norm": 10.0300931930542, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2058, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.030132181405148367, |
|
"grad_norm": 9.759441375732422, |
|
"learning_rate": 1e-05, |
|
"loss": 3.216, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03107381207405925, |
|
"grad_norm": 9.909936904907227, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3169, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.03201544274297014, |
|
"grad_norm": 10.085428237915039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3579, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.032957073411881026, |
|
"grad_norm": 11.790485382080078, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3376, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.03389870408079191, |
|
"grad_norm": 9.588286399841309, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2441, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0348403347497028, |
|
"grad_norm": 9.073866844177246, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3118, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.035781965418613684, |
|
"grad_norm": 11.346445083618164, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2795, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03672359608752457, |
|
"grad_norm": 8.872209548950195, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1868, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.03766522675643546, |
|
"grad_norm": 10.55789566040039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2716, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03860685742534634, |
|
"grad_norm": 9.99712085723877, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2368, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.03954848809425723, |
|
"grad_norm": 10.883671760559082, |
|
"learning_rate": 1e-05, |
|
"loss": 3.237, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04049011876316812, |
|
"grad_norm": 10.810287475585938, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2724, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.041431749432079, |
|
"grad_norm": 8.746783256530762, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2741, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04237338010098989, |
|
"grad_norm": 10.720795631408691, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2725, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.043315010769900776, |
|
"grad_norm": 10.715919494628906, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2604, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04425664143881166, |
|
"grad_norm": 9.999095916748047, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3685, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.04519827210772255, |
|
"grad_norm": 9.06040096282959, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3883, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.046139902776633435, |
|
"grad_norm": 9.397573471069336, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2377, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.04708153344554432, |
|
"grad_norm": 11.011545181274414, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1805, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04802316411445521, |
|
"grad_norm": 8.947606086730957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1641, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.04896479478336609, |
|
"grad_norm": 9.644259452819824, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2127, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.049906425452276984, |
|
"grad_norm": 9.601791381835938, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2503, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.05084805612118787, |
|
"grad_norm": 9.706978797912598, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2276, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05178968679009875, |
|
"grad_norm": 10.092705726623535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2853, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.05273131745900964, |
|
"grad_norm": 10.306342124938965, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3613, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.053672948127920526, |
|
"grad_norm": 9.216752052307129, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1688, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.05461457879683141, |
|
"grad_norm": 9.452301979064941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2417, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0555562094657423, |
|
"grad_norm": 9.074745178222656, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3112, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.056497840134653185, |
|
"grad_norm": 8.492777824401855, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2811, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05743947080356407, |
|
"grad_norm": 9.102340698242188, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2543, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.05838110147247496, |
|
"grad_norm": 9.913708686828613, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3047, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.059322732141385844, |
|
"grad_norm": 8.82358455657959, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2744, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.060264362810296734, |
|
"grad_norm": 10.01116943359375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3184, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06120599347920762, |
|
"grad_norm": 8.861343383789062, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3169, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.0621476241481185, |
|
"grad_norm": 8.552498817443848, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3267, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06308925481702939, |
|
"grad_norm": 9.180069923400879, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1593, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.06403088548594028, |
|
"grad_norm": 8.194535255432129, |
|
"learning_rate": 1e-05, |
|
"loss": 3.252, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06497251615485117, |
|
"grad_norm": 9.234159469604492, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3233, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.06591414682376205, |
|
"grad_norm": 9.615317344665527, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1951, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06685577749267294, |
|
"grad_norm": 9.555344581604004, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3024, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.06779740816158382, |
|
"grad_norm": 9.033773422241211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2833, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0687390388304947, |
|
"grad_norm": 10.110182762145996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3022, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.0696806694994056, |
|
"grad_norm": 8.350312232971191, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1566, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07062230016831648, |
|
"grad_norm": 9.642204284667969, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2279, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.07156393083722737, |
|
"grad_norm": 8.922101020812988, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1227, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07250556150613825, |
|
"grad_norm": 8.6968412399292, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2537, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.07344719217504914, |
|
"grad_norm": 9.628043174743652, |
|
"learning_rate": 1e-05, |
|
"loss": 3.335, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07438882284396003, |
|
"grad_norm": 9.418974876403809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2609, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.07533045351287092, |
|
"grad_norm": 9.786445617675781, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1916, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0762720841817818, |
|
"grad_norm": 9.372411727905273, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2404, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.07721371485069269, |
|
"grad_norm": 9.255708694458008, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1984, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.07815534551960357, |
|
"grad_norm": 8.31712818145752, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2435, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.07909697618851445, |
|
"grad_norm": 9.45557975769043, |
|
"learning_rate": 1e-05, |
|
"loss": 3.271, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08003860685742535, |
|
"grad_norm": 9.2068510055542, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2629, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.08098023752633624, |
|
"grad_norm": 9.854654312133789, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2466, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08192186819524712, |
|
"grad_norm": 9.899443626403809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.303, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.082863498864158, |
|
"grad_norm": 8.167075157165527, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1332, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08380512953306889, |
|
"grad_norm": 9.357007026672363, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2161, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.08474676020197978, |
|
"grad_norm": 8.328939437866211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2558, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08568839087089067, |
|
"grad_norm": 10.344287872314453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1148, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.08663002153980155, |
|
"grad_norm": 8.459121704101562, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2366, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.08757165220871244, |
|
"grad_norm": 9.339311599731445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3596, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.08851328287762332, |
|
"grad_norm": 9.49207592010498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2823, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0894549135465342, |
|
"grad_norm": 9.24524974822998, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2593, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.0903965442154451, |
|
"grad_norm": 9.082175254821777, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1761, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09133817488435599, |
|
"grad_norm": 8.596846580505371, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2005, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.09227980555326687, |
|
"grad_norm": 9.297995567321777, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2414, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.09322143622217775, |
|
"grad_norm": 9.254470825195312, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2309, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.09416306689108864, |
|
"grad_norm": 9.504804611206055, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2978, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09510469755999953, |
|
"grad_norm": 9.160736083984375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2522, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.09604632822891042, |
|
"grad_norm": 8.80601692199707, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1128, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0969879588978213, |
|
"grad_norm": 9.246788024902344, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2469, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.09792958956673219, |
|
"grad_norm": 8.799399375915527, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1663, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.09887122023564307, |
|
"grad_norm": 8.722356796264648, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2616, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.09981285090455397, |
|
"grad_norm": 9.724103927612305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2367, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.10075448157346485, |
|
"grad_norm": 9.14875602722168, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1932, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.10169611224237574, |
|
"grad_norm": 8.882226943969727, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1863, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.10263774291128662, |
|
"grad_norm": 8.96247673034668, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3411, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.1035793735801975, |
|
"grad_norm": 7.7259202003479, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2209, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10452100424910839, |
|
"grad_norm": 8.8307466506958, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1653, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.10546263491801929, |
|
"grad_norm": 9.439279556274414, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2451, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.10640426558693017, |
|
"grad_norm": 8.951433181762695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2327, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.10734589625584105, |
|
"grad_norm": 8.996528625488281, |
|
"learning_rate": 1e-05, |
|
"loss": 3.177, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.10828752692475194, |
|
"grad_norm": 8.661666870117188, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2389, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.10922915759366282, |
|
"grad_norm": 7.827404499053955, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2133, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.11017078826257372, |
|
"grad_norm": 8.948150634765625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1059, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.1111124189314846, |
|
"grad_norm": 9.571684837341309, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2737, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.11205404960039549, |
|
"grad_norm": 8.415332794189453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.181, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.11299568026930637, |
|
"grad_norm": 8.199889183044434, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1989, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11393731093821725, |
|
"grad_norm": 8.935426712036133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.165, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.11487894160712814, |
|
"grad_norm": 9.254772186279297, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2167, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.11582057227603904, |
|
"grad_norm": 8.660184860229492, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1573, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.11676220294494992, |
|
"grad_norm": 9.153916358947754, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2069, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1177038336138608, |
|
"grad_norm": 10.136688232421875, |
|
"learning_rate": 1e-05, |
|
"loss": 3.184, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.11864546428277169, |
|
"grad_norm": 8.210442543029785, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2377, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.11958709495168257, |
|
"grad_norm": 10.24669361114502, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2338, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.12052872562059347, |
|
"grad_norm": 8.631328582763672, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3123, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.12147035628950435, |
|
"grad_norm": 8.293212890625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1506, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.12241198695841524, |
|
"grad_norm": 9.084228515625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1423, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.12335361762732612, |
|
"grad_norm": 7.838293552398682, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0779, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.124295248296237, |
|
"grad_norm": 10.947461128234863, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1623, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1252368789651479, |
|
"grad_norm": 9.293968200683594, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2047, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.12617850963405877, |
|
"grad_norm": 8.150403022766113, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1688, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.12712014030296967, |
|
"grad_norm": 8.54336929321289, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2182, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.12806177097188057, |
|
"grad_norm": 7.81638240814209, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2553, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.12900340164079144, |
|
"grad_norm": 8.324007034301758, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1702, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.12994503230970234, |
|
"grad_norm": 9.187409400939941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1752, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.1308866629786132, |
|
"grad_norm": 8.79953670501709, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2211, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.1318282936475241, |
|
"grad_norm": 9.9981689453125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1846, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13276992431643497, |
|
"grad_norm": 8.74207592010498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2351, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.13371155498534587, |
|
"grad_norm": 8.988236427307129, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2312, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.13465318565425677, |
|
"grad_norm": 8.005229949951172, |
|
"learning_rate": 1e-05, |
|
"loss": 3.218, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.13559481632316764, |
|
"grad_norm": 9.276175498962402, |
|
"learning_rate": 1e-05, |
|
"loss": 3.279, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.13653644699207854, |
|
"grad_norm": 8.09757137298584, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2876, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.1374780776609894, |
|
"grad_norm": 8.40072250366211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2547, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.1384197083299003, |
|
"grad_norm": 8.614055633544922, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2795, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.1393613389988112, |
|
"grad_norm": 9.18989086151123, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0988, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.14030296966772207, |
|
"grad_norm": 8.401784896850586, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1612, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.14124460033663297, |
|
"grad_norm": 8.722879409790039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1921, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.14218623100554384, |
|
"grad_norm": 8.14240550994873, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2371, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.14312786167445474, |
|
"grad_norm": 10.812965393066406, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2239, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.14406949234336563, |
|
"grad_norm": 8.65410327911377, |
|
"learning_rate": 1e-05, |
|
"loss": 3.267, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.1450111230122765, |
|
"grad_norm": 7.997138977050781, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2387, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.1459527536811874, |
|
"grad_norm": 8.726889610290527, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1542, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.14689438435009827, |
|
"grad_norm": 8.375578880310059, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1742, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.14783601501900917, |
|
"grad_norm": 8.577352523803711, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2441, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.14877764568792007, |
|
"grad_norm": 8.30477523803711, |
|
"learning_rate": 1e-05, |
|
"loss": 3.133, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.14971927635683094, |
|
"grad_norm": 8.190656661987305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1822, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.15066090702574184, |
|
"grad_norm": 8.631675720214844, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2068, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1516025376946527, |
|
"grad_norm": 9.275361061096191, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2194, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.1525441683635636, |
|
"grad_norm": 8.464102745056152, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1456, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.1534857990324745, |
|
"grad_norm": 8.44454288482666, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1675, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.15442742970138537, |
|
"grad_norm": 8.282485961914062, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1728, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.15536906037029627, |
|
"grad_norm": 8.142829895019531, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2012, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.15631069103920714, |
|
"grad_norm": 8.29863452911377, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1008, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.15725232170811804, |
|
"grad_norm": 8.145419120788574, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0711, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.1581939523770289, |
|
"grad_norm": 8.715362548828125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1625, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.1591355830459398, |
|
"grad_norm": 8.283182144165039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.066, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.1600772137148507, |
|
"grad_norm": 9.210651397705078, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2498, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.16101884438376157, |
|
"grad_norm": 8.161669731140137, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2414, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.16196047505267247, |
|
"grad_norm": 8.109793663024902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1307, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.16290210572158334, |
|
"grad_norm": 8.473872184753418, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1473, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.16384373639049424, |
|
"grad_norm": 8.496044158935547, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1971, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.16478536705940514, |
|
"grad_norm": 7.473423004150391, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1617, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.165726997728316, |
|
"grad_norm": 7.770011901855469, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1806, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.1666686283972269, |
|
"grad_norm": 8.22207260131836, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2673, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.16761025906613777, |
|
"grad_norm": 9.465033531188965, |
|
"learning_rate": 1e-05, |
|
"loss": 3.273, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.16855188973504867, |
|
"grad_norm": 8.1371431350708, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1663, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.16949352040395957, |
|
"grad_norm": 8.64779281616211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1979, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17043515107287044, |
|
"grad_norm": 8.860164642333984, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2013, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.17137678174178134, |
|
"grad_norm": 8.785137176513672, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1721, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.1723184124106922, |
|
"grad_norm": 8.557008743286133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1774, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.1732600430796031, |
|
"grad_norm": 8.913068771362305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.095, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.174201673748514, |
|
"grad_norm": 7.609038352966309, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1806, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.17514330441742487, |
|
"grad_norm": 7.761782169342041, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2352, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.17608493508633577, |
|
"grad_norm": 8.180496215820312, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2167, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.17702656575524664, |
|
"grad_norm": 8.675491333007812, |
|
"learning_rate": 1e-05, |
|
"loss": 3.12, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.17796819642415754, |
|
"grad_norm": 8.088050842285156, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1027, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.1789098270930684, |
|
"grad_norm": 8.452052116394043, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9981, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.1798514577619793, |
|
"grad_norm": 8.1648588180542, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0539, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.1807930884308902, |
|
"grad_norm": 8.245767593383789, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1362, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.18173471909980107, |
|
"grad_norm": 8.551701545715332, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1821, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.18267634976871197, |
|
"grad_norm": 8.178792953491211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1523, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.18361798043762284, |
|
"grad_norm": 8.187594413757324, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2531, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.18455961110653374, |
|
"grad_norm": 7.719356060028076, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1006, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.18550124177544464, |
|
"grad_norm": 8.289334297180176, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1259, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.1864428724443555, |
|
"grad_norm": 8.719657897949219, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1813, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.1873845031132664, |
|
"grad_norm": 8.192325592041016, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1389, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.18832613378217727, |
|
"grad_norm": 7.759548664093018, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1203, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18832613378217727, |
|
"eval_accuracy": 0.3959961907900079, |
|
"eval_loss": 3.179180860519409, |
|
"eval_runtime": 1109.8083, |
|
"eval_samples_per_second": 34.024, |
|
"eval_steps_per_second": 8.506, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18926776445108817, |
|
"grad_norm": 8.558948516845703, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2909, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.19020939511999907, |
|
"grad_norm": 7.582333087921143, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1388, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.19115102578890994, |
|
"grad_norm": 8.941636085510254, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1176, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.19209265645782084, |
|
"grad_norm": 8.161066055297852, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1302, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.1930342871267317, |
|
"grad_norm": 8.573451042175293, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1978, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.1939759177956426, |
|
"grad_norm": 9.676179885864258, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2441, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.1949175484645535, |
|
"grad_norm": 9.518370628356934, |
|
"learning_rate": 1e-05, |
|
"loss": 3.209, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.19585917913346437, |
|
"grad_norm": 8.120454788208008, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0915, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.19680080980237527, |
|
"grad_norm": 8.362229347229004, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2262, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.19774244047128614, |
|
"grad_norm": 8.197416305541992, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1164, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.19868407114019704, |
|
"grad_norm": 8.58997631072998, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2323, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.19962570180910794, |
|
"grad_norm": 7.581511497497559, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1852, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2005673324780188, |
|
"grad_norm": 7.385310649871826, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2516, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.2015089631469297, |
|
"grad_norm": 7.678237438201904, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1669, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.20245059381584057, |
|
"grad_norm": 8.25271224975586, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1575, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.20339222448475147, |
|
"grad_norm": 7.587473392486572, |
|
"learning_rate": 1e-05, |
|
"loss": 3.182, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.20433385515366234, |
|
"grad_norm": 7.0264153480529785, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1049, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.20527548582257324, |
|
"grad_norm": 7.00242280960083, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0769, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.20621711649148414, |
|
"grad_norm": 8.321081161499023, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0977, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.207158747160395, |
|
"grad_norm": 8.19550609588623, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1317, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2081003778293059, |
|
"grad_norm": 7.681639671325684, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2471, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.20904200849821677, |
|
"grad_norm": 8.00836181640625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.183, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.20998363916712767, |
|
"grad_norm": 7.7261576652526855, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2104, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.21092526983603857, |
|
"grad_norm": 7.512089729309082, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0842, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.21186690050494944, |
|
"grad_norm": 8.911776542663574, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2527, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.21280853117386034, |
|
"grad_norm": 7.831639766693115, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2335, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2137501618427712, |
|
"grad_norm": 7.919592380523682, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1974, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.2146917925116821, |
|
"grad_norm": 7.262505531311035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0964, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.215633423180593, |
|
"grad_norm": 7.787773132324219, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1731, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.21657505384950387, |
|
"grad_norm": 8.574644088745117, |
|
"learning_rate": 1e-05, |
|
"loss": 3.203, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.21751668451841477, |
|
"grad_norm": 8.63223648071289, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0797, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.21845831518732564, |
|
"grad_norm": 7.836942195892334, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1964, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.21939994585623654, |
|
"grad_norm": 8.048019409179688, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0157, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.22034157652514744, |
|
"grad_norm": 9.863574981689453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1937, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.2212832071940583, |
|
"grad_norm": 7.458190441131592, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1185, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.2222248378629692, |
|
"grad_norm": 7.304734230041504, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0673, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.22316646853188007, |
|
"grad_norm": 8.21141529083252, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1101, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.22410809920079097, |
|
"grad_norm": 8.499639511108398, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2017, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.22504972986970187, |
|
"grad_norm": 8.153023719787598, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1853, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.22599136053861274, |
|
"grad_norm": 8.166133880615234, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0598, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.22693299120752364, |
|
"grad_norm": 8.11247730255127, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2231, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.2278746218764345, |
|
"grad_norm": 9.046285629272461, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0795, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.2288162525453454, |
|
"grad_norm": 8.127588272094727, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1155, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.22975788321425628, |
|
"grad_norm": 7.537458419799805, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1393, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.23069951388316717, |
|
"grad_norm": 7.949858665466309, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1817, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.23164114455207807, |
|
"grad_norm": 8.178318977355957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1628, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.23258277522098894, |
|
"grad_norm": 7.652297019958496, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1317, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.23352440588989984, |
|
"grad_norm": 7.234852313995361, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1783, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.2344660365588107, |
|
"grad_norm": 7.958459854125977, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1752, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.2354076672277216, |
|
"grad_norm": 7.870058536529541, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2413, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2363492978966325, |
|
"grad_norm": 8.846611976623535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1678, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.23729092856554337, |
|
"grad_norm": 8.129776000976562, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1669, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.23823255923445427, |
|
"grad_norm": 6.927892208099365, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1201, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.23917418990336514, |
|
"grad_norm": 9.028277397155762, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1909, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.24011582057227604, |
|
"grad_norm": 8.353940963745117, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2572, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.24105745124118694, |
|
"grad_norm": 7.9163737297058105, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1206, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2419990819100978, |
|
"grad_norm": 8.831631660461426, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2485, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.2429407125790087, |
|
"grad_norm": 7.537317752838135, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1398, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.24388234324791958, |
|
"grad_norm": 7.471547603607178, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0652, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.24482397391683047, |
|
"grad_norm": 7.851377487182617, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0686, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.24576560458574137, |
|
"grad_norm": 8.015359878540039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2359, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.24670723525465224, |
|
"grad_norm": 8.371912002563477, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0857, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.24764886592356314, |
|
"grad_norm": 7.510787487030029, |
|
"learning_rate": 1e-05, |
|
"loss": 3.244, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.248590496592474, |
|
"grad_norm": 7.756171703338623, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1444, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.2495321272613849, |
|
"grad_norm": 8.038382530212402, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2096, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.2504737579302958, |
|
"grad_norm": 7.765430927276611, |
|
"learning_rate": 1e-05, |
|
"loss": 3.139, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.2514153885992067, |
|
"grad_norm": 8.474689483642578, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1436, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.25235701926811754, |
|
"grad_norm": 9.304710388183594, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1479, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.25329864993702844, |
|
"grad_norm": 10.669320106506348, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0753, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.25424028060593934, |
|
"grad_norm": 6.909900665283203, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0703, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.25518191127485024, |
|
"grad_norm": 8.000332832336426, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0837, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.25612354194376113, |
|
"grad_norm": 7.857721328735352, |
|
"learning_rate": 1e-05, |
|
"loss": 3.121, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.257065172612672, |
|
"grad_norm": 7.735369682312012, |
|
"learning_rate": 1e-05, |
|
"loss": 3.122, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.2580068032815829, |
|
"grad_norm": 7.965452194213867, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1381, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.2589484339504938, |
|
"grad_norm": 8.671842575073242, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1744, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.25989006461940467, |
|
"grad_norm": 7.975509166717529, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1222, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.26083169528831557, |
|
"grad_norm": 7.30696964263916, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1122, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.2617733259572264, |
|
"grad_norm": 8.272529602050781, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0969, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.2627149566261373, |
|
"grad_norm": 8.145869255065918, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0817, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.2636565872950482, |
|
"grad_norm": 7.628718852996826, |
|
"learning_rate": 1e-05, |
|
"loss": 3.105, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2645982179639591, |
|
"grad_norm": 6.900229454040527, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2028, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.26553984863286995, |
|
"grad_norm": 7.9082207679748535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.014, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.26648147930178084, |
|
"grad_norm": 8.532185554504395, |
|
"learning_rate": 1e-05, |
|
"loss": 3.143, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.26742310997069174, |
|
"grad_norm": 8.344785690307617, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1865, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.26836474063960264, |
|
"grad_norm": 7.369927883148193, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0858, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.26930637130851354, |
|
"grad_norm": 7.539013385772705, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9854, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.2702480019774244, |
|
"grad_norm": 8.182465553283691, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0504, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.2711896326463353, |
|
"grad_norm": 8.410788536071777, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0354, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.2721312633152462, |
|
"grad_norm": 7.914114475250244, |
|
"learning_rate": 1e-05, |
|
"loss": 3.067, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.2730728939841571, |
|
"grad_norm": 8.113722801208496, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1172, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.27401452465306797, |
|
"grad_norm": 7.887296676635742, |
|
"learning_rate": 1e-05, |
|
"loss": 3.139, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.2749561553219788, |
|
"grad_norm": 9.083703994750977, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1805, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.2758977859908897, |
|
"grad_norm": 7.937193393707275, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1389, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.2768394166598006, |
|
"grad_norm": 8.576871871948242, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0739, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.2777810473287115, |
|
"grad_norm": 8.345330238342285, |
|
"learning_rate": 1e-05, |
|
"loss": 3.043, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.2787226779976224, |
|
"grad_norm": 8.3008394241333, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2218, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.27966430866653325, |
|
"grad_norm": 7.56035852432251, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1249, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.28060593933544414, |
|
"grad_norm": 7.499727249145508, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0944, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.28154757000435504, |
|
"grad_norm": 8.389445304870605, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0844, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.28248920067326594, |
|
"grad_norm": 9.202953338623047, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2153, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.28343083134217684, |
|
"grad_norm": 7.818172454833984, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0676, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.2843724620110877, |
|
"grad_norm": 7.743332386016846, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0716, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.2853140926799986, |
|
"grad_norm": 8.619895935058594, |
|
"learning_rate": 1e-05, |
|
"loss": 3.142, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.2862557233489095, |
|
"grad_norm": 7.999989986419678, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2461, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.28719735401782037, |
|
"grad_norm": 7.5509772300720215, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1051, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.28813898468673127, |
|
"grad_norm": 7.679853916168213, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1131, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.2890806153556421, |
|
"grad_norm": 7.623820781707764, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1323, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.290022246024553, |
|
"grad_norm": 7.860290050506592, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1595, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.2909638766934639, |
|
"grad_norm": 7.90553092956543, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0943, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.2919055073623748, |
|
"grad_norm": 8.377348899841309, |
|
"learning_rate": 1e-05, |
|
"loss": 3.194, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.2928471380312857, |
|
"grad_norm": 7.801243305206299, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0033, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.29378876870019655, |
|
"grad_norm": 8.063822746276855, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1661, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.29473039936910744, |
|
"grad_norm": 7.1917314529418945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1411, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.29567203003801834, |
|
"grad_norm": 7.78692102432251, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0471, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.29661366070692924, |
|
"grad_norm": 8.212677001953125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1417, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.29755529137584014, |
|
"grad_norm": 6.906081676483154, |
|
"learning_rate": 1e-05, |
|
"loss": 3.13, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.298496922044751, |
|
"grad_norm": 7.585642337799072, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1489, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.2994385527136619, |
|
"grad_norm": 7.452507495880127, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1203, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.3003801833825728, |
|
"grad_norm": 9.468456268310547, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1018, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.30132181405148367, |
|
"grad_norm": 8.016668319702148, |
|
"learning_rate": 1e-05, |
|
"loss": 3.209, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.30226344472039457, |
|
"grad_norm": 7.062180995941162, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2921, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.3032050753893054, |
|
"grad_norm": 7.844501495361328, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1812, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.3041467060582163, |
|
"grad_norm": 6.8861308097839355, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0092, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.3050883367271272, |
|
"grad_norm": 7.58292818069458, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1616, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.3060299673960381, |
|
"grad_norm": 7.547516822814941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1064, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.306971598064949, |
|
"grad_norm": 7.924961090087891, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1486, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.30791322873385985, |
|
"grad_norm": 9.10555362701416, |
|
"learning_rate": 1e-05, |
|
"loss": 2.995, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.30885485940277074, |
|
"grad_norm": 7.4836931228637695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1971, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.30979649007168164, |
|
"grad_norm": 9.865035057067871, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1604, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.31073812074059254, |
|
"grad_norm": 8.200763702392578, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0831, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3116797514095034, |
|
"grad_norm": 8.044021606445312, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1665, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.3126213820784143, |
|
"grad_norm": 7.945106029510498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0876, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.3135630127473252, |
|
"grad_norm": 8.18039608001709, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1172, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.3145046434162361, |
|
"grad_norm": 7.830636024475098, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1627, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.31544627408514697, |
|
"grad_norm": 8.153444290161133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0749, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.3163879047540578, |
|
"grad_norm": 8.141214370727539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0414, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.3173295354229687, |
|
"grad_norm": 7.207879543304443, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1829, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.3182711660918796, |
|
"grad_norm": 6.900830268859863, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0745, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.3192127967607905, |
|
"grad_norm": 7.9972243309021, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1691, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.3201544274297014, |
|
"grad_norm": 8.405791282653809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0937, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.32109605809861225, |
|
"grad_norm": 7.26245641708374, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0187, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.32203768876752314, |
|
"grad_norm": 8.00014591217041, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0608, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.32297931943643404, |
|
"grad_norm": 7.747166156768799, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1721, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.32392095010534494, |
|
"grad_norm": 8.84945011138916, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0493, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.32486258077425584, |
|
"grad_norm": 8.65839958190918, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0396, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.3258042114431667, |
|
"grad_norm": 8.27173900604248, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0956, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.3267458421120776, |
|
"grad_norm": 7.819243907928467, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1334, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.3276874727809885, |
|
"grad_norm": 8.155767440795898, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1312, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.3286291034498994, |
|
"grad_norm": 7.6382155418396, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9995, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.32957073411881027, |
|
"grad_norm": 7.503223896026611, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1386, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.3305123647877211, |
|
"grad_norm": 8.2285737991333, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0442, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.331453995456632, |
|
"grad_norm": 7.820169925689697, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0377, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.3323956261255429, |
|
"grad_norm": 7.6100969314575195, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1311, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.3333372567944538, |
|
"grad_norm": 7.763628005981445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0599, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.3342788874633647, |
|
"grad_norm": 8.461652755737305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.222, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.33522051813227555, |
|
"grad_norm": 7.414519309997559, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0587, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.33616214880118644, |
|
"grad_norm": 7.964784145355225, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2038, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.33710377947009734, |
|
"grad_norm": 7.656503200531006, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1324, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.33804541013900824, |
|
"grad_norm": 8.035988807678223, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1101, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.33898704080791914, |
|
"grad_norm": 7.202066898345947, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0915, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.33992867147683, |
|
"grad_norm": 8.242351531982422, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2059, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.3408703021457409, |
|
"grad_norm": 7.794888019561768, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1364, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.3418119328146518, |
|
"grad_norm": 7.34774923324585, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0507, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.3427535634835627, |
|
"grad_norm": 7.673720359802246, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0383, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.34369519415247357, |
|
"grad_norm": 7.948644638061523, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1888, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.3446368248213844, |
|
"grad_norm": 7.747712135314941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.089, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.3455784554902953, |
|
"grad_norm": 7.607177257537842, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1111, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.3465200861592062, |
|
"grad_norm": 6.7083916664123535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0944, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.3474617168281171, |
|
"grad_norm": 8.02619743347168, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1129, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.348403347497028, |
|
"grad_norm": 8.047721862792969, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1843, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.34934497816593885, |
|
"grad_norm": 7.408081531524658, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0514, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.35028660883484974, |
|
"grad_norm": 7.935153484344482, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1919, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.35122823950376064, |
|
"grad_norm": 7.98247766494751, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1084, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.35216987017267154, |
|
"grad_norm": 6.453451156616211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0292, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.35311150084158244, |
|
"grad_norm": 7.293237686157227, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0266, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.3540531315104933, |
|
"grad_norm": 7.254806041717529, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1074, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.3549947621794042, |
|
"grad_norm": 7.4231743812561035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1157, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.3559363928483151, |
|
"grad_norm": 7.126735210418701, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0207, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.356878023517226, |
|
"grad_norm": 7.252379417419434, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1247, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.3578196541861368, |
|
"grad_norm": 8.155769348144531, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1562, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3587612848550477, |
|
"grad_norm": 7.091341972351074, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0676, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.3597029155239586, |
|
"grad_norm": 7.706187725067139, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1094, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.3606445461928695, |
|
"grad_norm": 7.110264301300049, |
|
"learning_rate": 1e-05, |
|
"loss": 3.12, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.3615861768617804, |
|
"grad_norm": 6.751636505126953, |
|
"learning_rate": 1e-05, |
|
"loss": 3.189, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.36252780753069125, |
|
"grad_norm": 7.525967597961426, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1649, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.36346943819960215, |
|
"grad_norm": 7.6558356285095215, |
|
"learning_rate": 1e-05, |
|
"loss": 3.103, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.36441106886851304, |
|
"grad_norm": 7.773464679718018, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1464, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.36535269953742394, |
|
"grad_norm": 8.413071632385254, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0434, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.36629433020633484, |
|
"grad_norm": 9.84329605102539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.077, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.3672359608752457, |
|
"grad_norm": 7.708522319793701, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0714, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3681775915441566, |
|
"grad_norm": 7.770474910736084, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1796, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.3691192222130675, |
|
"grad_norm": 6.997617244720459, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1673, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.3700608528819784, |
|
"grad_norm": 7.298404216766357, |
|
"learning_rate": 1e-05, |
|
"loss": 2.955, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.3710024835508893, |
|
"grad_norm": 7.35360050201416, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9934, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.3719441142198001, |
|
"grad_norm": 7.945688247680664, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0656, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.372885744888711, |
|
"grad_norm": 8.263771057128906, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.3738273755576219, |
|
"grad_norm": 7.973668575286865, |
|
"learning_rate": 1e-05, |
|
"loss": 3.129, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.3747690062265328, |
|
"grad_norm": 7.491722583770752, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1392, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.3757106368954437, |
|
"grad_norm": 7.867580413818359, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1261, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.37665226756435455, |
|
"grad_norm": 7.192398548126221, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1165, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.37665226756435455, |
|
"eval_accuracy": 0.40667947257976045, |
|
"eval_loss": 3.105938673019409, |
|
"eval_runtime": 1037.0565, |
|
"eval_samples_per_second": 36.411, |
|
"eval_steps_per_second": 9.103, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.37759389823326545, |
|
"grad_norm": 7.978450775146484, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0121, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.37853552890217634, |
|
"grad_norm": 7.5898847579956055, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0836, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.37947715957108724, |
|
"grad_norm": 7.317259311676025, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0859, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.38041879023999814, |
|
"grad_norm": 7.094460964202881, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0992, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.381360420908909, |
|
"grad_norm": 7.557990074157715, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0912, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.3823020515778199, |
|
"grad_norm": 6.940727710723877, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0669, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.3832436822467308, |
|
"grad_norm": 8.001688957214355, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1704, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.3841853129156417, |
|
"grad_norm": 7.38444185256958, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0961, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.3851269435845526, |
|
"grad_norm": 6.947928428649902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1942, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.3860685742534634, |
|
"grad_norm": 7.699880123138428, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0334, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.3870102049223743, |
|
"grad_norm": 7.490096092224121, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1301, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.3879518355912852, |
|
"grad_norm": 7.5343451499938965, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0588, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.3888934662601961, |
|
"grad_norm": 9.956011772155762, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0914, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.389835096929107, |
|
"grad_norm": 7.387354373931885, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1279, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.39077672759801785, |
|
"grad_norm": 7.3654351234436035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0307, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.39171835826692875, |
|
"grad_norm": 7.671773910522461, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0223, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.39265998893583964, |
|
"grad_norm": 8.229500770568848, |
|
"learning_rate": 1e-05, |
|
"loss": 3.107, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.39360161960475054, |
|
"grad_norm": 7.7551798820495605, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1048, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.39454325027366144, |
|
"grad_norm": 6.759220600128174, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0824, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.3954848809425723, |
|
"grad_norm": 8.242779731750488, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0604, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3964265116114832, |
|
"grad_norm": 7.663089752197266, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1625, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.3973681422803941, |
|
"grad_norm": 8.07699203491211, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0494, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.398309772949305, |
|
"grad_norm": 7.712186336517334, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1758, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.39925140361821587, |
|
"grad_norm": 9.279346466064453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1234, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.4001930342871267, |
|
"grad_norm": 6.985004901885986, |
|
"learning_rate": 1e-05, |
|
"loss": 3.013, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.4011346649560376, |
|
"grad_norm": 7.505834102630615, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1199, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.4020762956249485, |
|
"grad_norm": 7.714130401611328, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0762, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.4030179262938594, |
|
"grad_norm": 7.358199119567871, |
|
"learning_rate": 1e-05, |
|
"loss": 3.053, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.40395955696277025, |
|
"grad_norm": 7.569924354553223, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1715, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.40490118763168115, |
|
"grad_norm": 7.3484697341918945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.093, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.40584281830059205, |
|
"grad_norm": 7.914135456085205, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0454, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.40678444896950294, |
|
"grad_norm": 7.99644660949707, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1014, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.40772607963841384, |
|
"grad_norm": 8.003348350524902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1274, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.4086677103073247, |
|
"grad_norm": 7.18101692199707, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0354, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.4096093409762356, |
|
"grad_norm": 7.236114501953125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0585, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.4105509716451465, |
|
"grad_norm": 7.164278984069824, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0792, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.4114926023140574, |
|
"grad_norm": 7.827737808227539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1141, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.4124342329829683, |
|
"grad_norm": 10.366262435913086, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0372, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.4133758636518791, |
|
"grad_norm": 8.009645462036133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0653, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.41431749432079, |
|
"grad_norm": 8.994948387145996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1723, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4152591249897009, |
|
"grad_norm": 6.850546360015869, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1088, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.4162007556586118, |
|
"grad_norm": 7.697965145111084, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1864, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.4171423863275227, |
|
"grad_norm": 7.226153373718262, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1957, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.41808401699643355, |
|
"grad_norm": 7.277571678161621, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1946, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.41902564766534445, |
|
"grad_norm": 7.740057945251465, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0261, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.41996727833425535, |
|
"grad_norm": 7.6060028076171875, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0848, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.42090890900316624, |
|
"grad_norm": 8.158476829528809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0151, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.42185053967207714, |
|
"grad_norm": 7.340221405029297, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9477, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.422792170340988, |
|
"grad_norm": 6.894491195678711, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0536, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.4237338010098989, |
|
"grad_norm": 7.707582473754883, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0879, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4246754316788098, |
|
"grad_norm": 7.966876029968262, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0493, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.4256170623477207, |
|
"grad_norm": 7.2440185546875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9738, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.4265586930166316, |
|
"grad_norm": 7.6277337074279785, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2125, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.4275003236855424, |
|
"grad_norm": 7.30360746383667, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1131, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.4284419543544533, |
|
"grad_norm": 7.502569675445557, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0646, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.4293835850233642, |
|
"grad_norm": 7.918311595916748, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1952, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.4303252156922751, |
|
"grad_norm": 7.016907691955566, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1168, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.431266846361186, |
|
"grad_norm": 7.167459964752197, |
|
"learning_rate": 1e-05, |
|
"loss": 3.126, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.43220847703009685, |
|
"grad_norm": 7.548356533050537, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0791, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.43315010769900775, |
|
"grad_norm": 6.640120506286621, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1876, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.43409173836791864, |
|
"grad_norm": 7.116699695587158, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0427, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.43503336903682954, |
|
"grad_norm": 7.0854668617248535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1455, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.43597499970574044, |
|
"grad_norm": 7.1804423332214355, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0863, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.4369166303746513, |
|
"grad_norm": 7.1510210037231445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1586, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.4378582610435622, |
|
"grad_norm": 7.871683120727539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1336, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.4387998917124731, |
|
"grad_norm": 7.599919319152832, |
|
"learning_rate": 1e-05, |
|
"loss": 3.05, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.439741522381384, |
|
"grad_norm": 8.056229591369629, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0782, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.4406831530502949, |
|
"grad_norm": 7.520589351654053, |
|
"learning_rate": 1e-05, |
|
"loss": 3.113, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.4416247837192057, |
|
"grad_norm": 6.8444952964782715, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0913, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.4425664143881166, |
|
"grad_norm": 7.817233562469482, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1279, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.4435080450570275, |
|
"grad_norm": 6.604560375213623, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1026, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.4444496757259384, |
|
"grad_norm": 7.087409496307373, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1523, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.4453913063948493, |
|
"grad_norm": 7.123996257781982, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0798, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.44633293706376015, |
|
"grad_norm": 7.211167812347412, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1813, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.44727456773267105, |
|
"grad_norm": 7.160671710968018, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0597, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.44821619840158194, |
|
"grad_norm": 7.699546813964844, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0837, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.44915782907049284, |
|
"grad_norm": 7.012664794921875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9755, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.45009945973940374, |
|
"grad_norm": 9.324268341064453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0906, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.4510410904083146, |
|
"grad_norm": 7.166225433349609, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0489, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.4519827210772255, |
|
"grad_norm": 7.982870578765869, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1248, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4529243517461364, |
|
"grad_norm": 7.513890743255615, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1008, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.4538659824150473, |
|
"grad_norm": 6.794632911682129, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1117, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.4548076130839581, |
|
"grad_norm": 8.143996238708496, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1321, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.455749243752869, |
|
"grad_norm": 7.631880283355713, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1185, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.4566908744217799, |
|
"grad_norm": 7.6734466552734375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0592, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.4576325050906908, |
|
"grad_norm": 6.934384346008301, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1555, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.4585741357596017, |
|
"grad_norm": 7.925302505493164, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0891, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.45951576642851255, |
|
"grad_norm": 7.213337421417236, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9779, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.46045739709742345, |
|
"grad_norm": 7.504617691040039, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0613, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.46139902776633435, |
|
"grad_norm": 7.625884056091309, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9949, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.46234065843524524, |
|
"grad_norm": 7.4358696937561035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1089, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.46328228910415614, |
|
"grad_norm": 7.1684489250183105, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9965, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.464223919773067, |
|
"grad_norm": 7.672368049621582, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0439, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.4651655504419779, |
|
"grad_norm": 7.329806327819824, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0025, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.4661071811108888, |
|
"grad_norm": 7.104738235473633, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0738, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.4670488117797997, |
|
"grad_norm": 6.561854839324951, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9749, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4679904424487106, |
|
"grad_norm": 7.665432453155518, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0647, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.4689320731176214, |
|
"grad_norm": 7.229370594024658, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0231, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.4698737037865323, |
|
"grad_norm": 7.576310157775879, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0132, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.4708153344554432, |
|
"grad_norm": 6.800790786743164, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0343, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4717569651243541, |
|
"grad_norm": 7.0642266273498535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0681, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.472698595793265, |
|
"grad_norm": 6.809929370880127, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1242, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.47364022646217585, |
|
"grad_norm": 7.387331008911133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0638, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.47458185713108675, |
|
"grad_norm": 7.186639308929443, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0472, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.47552348779999765, |
|
"grad_norm": 7.330045223236084, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1256, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.47646511846890854, |
|
"grad_norm": 7.658766746520996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0334, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.47740674913781944, |
|
"grad_norm": 7.821640968322754, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1666, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.4783483798067303, |
|
"grad_norm": 8.806328773498535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1129, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.4792900104756412, |
|
"grad_norm": 7.119672775268555, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0084, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.4802316411445521, |
|
"grad_norm": 7.20904541015625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0855, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.481173271813463, |
|
"grad_norm": 7.136516571044922, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9989, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.4821149024823739, |
|
"grad_norm": 6.163409233093262, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9992, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.4830565331512847, |
|
"grad_norm": 7.667194843292236, |
|
"learning_rate": 1e-05, |
|
"loss": 2.996, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.4839981638201956, |
|
"grad_norm": 7.100743293762207, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0643, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.4849397944891065, |
|
"grad_norm": 7.50702428817749, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1647, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.4858814251580174, |
|
"grad_norm": 7.247680187225342, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0211, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.4868230558269283, |
|
"grad_norm": 7.403180122375488, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0319, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.48776468649583915, |
|
"grad_norm": 7.5679473876953125, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0748, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.48870631716475005, |
|
"grad_norm": 7.389653205871582, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0973, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.48964794783366095, |
|
"grad_norm": 12.070070266723633, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0398, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.49058957850257184, |
|
"grad_norm": 7.190162658691406, |
|
"learning_rate": 1e-05, |
|
"loss": 3.16, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.49153120917148274, |
|
"grad_norm": 6.753963470458984, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0305, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.4924728398403936, |
|
"grad_norm": 7.3352789878845215, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0226, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.4934144705093045, |
|
"grad_norm": 6.455969333648682, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0626, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.4943561011782154, |
|
"grad_norm": 6.789318561553955, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1405, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.4952977318471263, |
|
"grad_norm": 7.1296186447143555, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9996, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.4962393625160372, |
|
"grad_norm": 7.188891887664795, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0108, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.497180993184948, |
|
"grad_norm": 8.604632377624512, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1121, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.4981226238538589, |
|
"grad_norm": 8.063715934753418, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1481, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.4990642545227698, |
|
"grad_norm": 7.208702087402344, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1499, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5000058851916807, |
|
"grad_norm": 6.898314952850342, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0624, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.5009475158605916, |
|
"grad_norm": 7.062436103820801, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9495, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.5018891465295025, |
|
"grad_norm": 7.108069896697998, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9833, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.5028307771984134, |
|
"grad_norm": 8.10451602935791, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1446, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.5037724078673242, |
|
"grad_norm": 7.409706115722656, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1812, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.5047140385362351, |
|
"grad_norm": 7.078268527984619, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0981, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.505655669205146, |
|
"grad_norm": 6.625565052032471, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0404, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.5065972998740569, |
|
"grad_norm": 7.314438819885254, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0807, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.5075389305429678, |
|
"grad_norm": 6.884581565856934, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0895, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.5084805612118787, |
|
"grad_norm": 7.702692031860352, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0731, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5094221918807895, |
|
"grad_norm": 7.680057525634766, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0837, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.5103638225497005, |
|
"grad_norm": 7.11198616027832, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9995, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.5113054532186113, |
|
"grad_norm": 7.569001197814941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1655, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.5122470838875223, |
|
"grad_norm": 7.120611667633057, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0245, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.5131887145564331, |
|
"grad_norm": 7.674874305725098, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0993, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.514130345225344, |
|
"grad_norm": 7.820282459259033, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0897, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.5150719758942549, |
|
"grad_norm": 7.010729789733887, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9622, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.5160136065631657, |
|
"grad_norm": 7.443545818328857, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1474, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.5169552372320767, |
|
"grad_norm": 8.485078811645508, |
|
"learning_rate": 1e-05, |
|
"loss": 3.095, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.5178968679009875, |
|
"grad_norm": 7.487083435058594, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1509, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.5188384985698984, |
|
"grad_norm": 7.581494331359863, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0771, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.5197801292388093, |
|
"grad_norm": 7.041471481323242, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9578, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.5207217599077202, |
|
"grad_norm": 6.6093034744262695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0864, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.5216633905766311, |
|
"grad_norm": 6.595880031585693, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9654, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.522605021245542, |
|
"grad_norm": 7.0556511878967285, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9616, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.5235466519144528, |
|
"grad_norm": 7.266999244689941, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1041, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.5244882825833638, |
|
"grad_norm": 7.564467430114746, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1307, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.5254299132522746, |
|
"grad_norm": 6.620129108428955, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0366, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.5263715439211855, |
|
"grad_norm": 6.7112321853637695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0846, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.5273131745900964, |
|
"grad_norm": 8.041632652282715, |
|
"learning_rate": 1e-05, |
|
"loss": 3.161, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5282548052590073, |
|
"grad_norm": 7.796072006225586, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0004, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.5291964359279182, |
|
"grad_norm": 7.121657371520996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0108, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.530138066596829, |
|
"grad_norm": 7.073033809661865, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0985, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.5310796972657399, |
|
"grad_norm": 6.580442905426025, |
|
"learning_rate": 1e-05, |
|
"loss": 3.081, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.5320213279346508, |
|
"grad_norm": 7.59156608581543, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1838, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.5329629586035617, |
|
"grad_norm": 6.721426963806152, |
|
"learning_rate": 1e-05, |
|
"loss": 3.12, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.5339045892724726, |
|
"grad_norm": 6.488240718841553, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0373, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.5348462199413835, |
|
"grad_norm": 7.797455787658691, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0716, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.5357878506102943, |
|
"grad_norm": 7.475494861602783, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9321, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.5367294812792053, |
|
"grad_norm": 7.374056339263916, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1087, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.5376711119481161, |
|
"grad_norm": 6.967336654663086, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0788, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.5386127426170271, |
|
"grad_norm": 7.319347858428955, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1399, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.5395543732859379, |
|
"grad_norm": 7.5937957763671875, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0926, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.5404960039548488, |
|
"grad_norm": 7.879610538482666, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0853, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.5414376346237597, |
|
"grad_norm": 7.16407585144043, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8899, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.5423792652926706, |
|
"grad_norm": 7.709092617034912, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1303, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.5433208959615815, |
|
"grad_norm": 7.1035356521606445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1267, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.5442625266304923, |
|
"grad_norm": 6.998748302459717, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1023, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.5452041572994032, |
|
"grad_norm": 7.0011186599731445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1474, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.5461457879683141, |
|
"grad_norm": 6.540005207061768, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9699, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.547087418637225, |
|
"grad_norm": 7.625404357910156, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2087, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.5480290493061359, |
|
"grad_norm": 8.491080284118652, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0275, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.5489706799750468, |
|
"grad_norm": 7.326952934265137, |
|
"learning_rate": 1e-05, |
|
"loss": 3.119, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.5499123106439576, |
|
"grad_norm": 6.557469367980957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0491, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.5508539413128686, |
|
"grad_norm": 8.412656784057617, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9752, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.5517955719817794, |
|
"grad_norm": 7.805161476135254, |
|
"learning_rate": 1e-05, |
|
"loss": 3.188, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.5527372026506904, |
|
"grad_norm": 7.706869602203369, |
|
"learning_rate": 1e-05, |
|
"loss": 3.01, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.5536788333196012, |
|
"grad_norm": 6.797297477722168, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9473, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.5546204639885121, |
|
"grad_norm": 7.4034624099731445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0252, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.555562094657423, |
|
"grad_norm": 8.384190559387207, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0013, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5565037253263339, |
|
"grad_norm": 7.6911187171936035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1842, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.5574453559952448, |
|
"grad_norm": 6.948124408721924, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9562, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.5583869866641556, |
|
"grad_norm": 7.233328819274902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0687, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.5593286173330665, |
|
"grad_norm": 7.2016425132751465, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0819, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.5602702480019774, |
|
"grad_norm": 6.626002311706543, |
|
"learning_rate": 1e-05, |
|
"loss": 3.078, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.5612118786708883, |
|
"grad_norm": 7.238675117492676, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1172, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.5621535093397992, |
|
"grad_norm": 6.863603115081787, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0753, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.5630951400087101, |
|
"grad_norm": 7.731668949127197, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9837, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.5640367706776209, |
|
"grad_norm": 6.4692702293396, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1187, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.5649784013465319, |
|
"grad_norm": 7.253861427307129, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9771, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5649784013465319, |
|
"eval_accuracy": 0.4138860273852878, |
|
"eval_loss": 3.0637450218200684, |
|
"eval_runtime": 1049.1644, |
|
"eval_samples_per_second": 35.991, |
|
"eval_steps_per_second": 8.998, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5659200320154427, |
|
"grad_norm": 7.105886459350586, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0877, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.5668616626843537, |
|
"grad_norm": 6.724785327911377, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0216, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.5678032933532645, |
|
"grad_norm": 6.603315353393555, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1276, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.5687449240221754, |
|
"grad_norm": 6.701154708862305, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9892, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.5696865546910863, |
|
"grad_norm": 6.806232929229736, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0323, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.5706281853599972, |
|
"grad_norm": 7.004578113555908, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0609, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.5715698160289081, |
|
"grad_norm": 6.511326789855957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1092, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.572511446697819, |
|
"grad_norm": 7.045777320861816, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1217, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5734530773667298, |
|
"grad_norm": 6.780973434448242, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0726, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.5743947080356407, |
|
"grad_norm": 7.139321804046631, |
|
"learning_rate": 1e-05, |
|
"loss": 3.065, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5753363387045516, |
|
"grad_norm": 7.129504203796387, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1518, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.5762779693734625, |
|
"grad_norm": 8.042350769042969, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0006, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.5772196000423734, |
|
"grad_norm": 6.995969772338867, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0849, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.5781612307112842, |
|
"grad_norm": 6.369194030761719, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9541, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.5791028613801952, |
|
"grad_norm": 6.521533012390137, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0632, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.580044492049106, |
|
"grad_norm": 7.185681343078613, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1407, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.580986122718017, |
|
"grad_norm": 6.984232425689697, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1047, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.5819277533869278, |
|
"grad_norm": 7.491443634033203, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0165, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.5828693840558387, |
|
"grad_norm": 6.652406215667725, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1079, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.5838110147247496, |
|
"grad_norm": 7.801830291748047, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1699, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5847526453936605, |
|
"grad_norm": 7.229469299316406, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0419, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.5856942760625714, |
|
"grad_norm": 6.888019561767578, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9814, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.5866359067314822, |
|
"grad_norm": 7.353707790374756, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0864, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.5875775374003931, |
|
"grad_norm": 7.126093864440918, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1298, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.588519168069304, |
|
"grad_norm": 7.8923211097717285, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0013, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.5894607987382149, |
|
"grad_norm": 6.3733649253845215, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9838, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.5904024294071258, |
|
"grad_norm": 7.295039176940918, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1155, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.5913440600760367, |
|
"grad_norm": 7.227429389953613, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1303, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.5922856907449475, |
|
"grad_norm": 7.039018630981445, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9929, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.5932273214138585, |
|
"grad_norm": 8.034615516662598, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0979, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5941689520827693, |
|
"grad_norm": 7.214796543121338, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0098, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.5951105827516803, |
|
"grad_norm": 7.179065227508545, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0168, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.5960522134205911, |
|
"grad_norm": 7.219829559326172, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9791, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.596993844089502, |
|
"grad_norm": 7.285015106201172, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9036, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.5979354747584129, |
|
"grad_norm": 8.418420791625977, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9316, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.5988771054273238, |
|
"grad_norm": 7.653911590576172, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0915, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.5998187360962347, |
|
"grad_norm": 6.934592247009277, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9547, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.6007603667651455, |
|
"grad_norm": 6.826767444610596, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9792, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.6017019974340564, |
|
"grad_norm": 7.79368782043457, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0121, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.6026436281029673, |
|
"grad_norm": 7.679154872894287, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1288, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6035852587718782, |
|
"grad_norm": 7.1232781410217285, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0423, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.6045268894407891, |
|
"grad_norm": 7.5240983963012695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.056, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.6054685201097, |
|
"grad_norm": 7.4613237380981445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0777, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.6064101507786108, |
|
"grad_norm": 6.664481163024902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0872, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.6073517814475218, |
|
"grad_norm": 6.648809432983398, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1206, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.6082934121164326, |
|
"grad_norm": 6.968840599060059, |
|
"learning_rate": 1e-05, |
|
"loss": 3.052, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.6092350427853436, |
|
"grad_norm": 7.363081932067871, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1386, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.6101766734542544, |
|
"grad_norm": 6.684266567230225, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9426, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.6111183041231653, |
|
"grad_norm": 7.6497650146484375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9756, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.6120599347920762, |
|
"grad_norm": 7.052670955657959, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0383, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.613001565460987, |
|
"grad_norm": 6.5792083740234375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8681, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.613943196129898, |
|
"grad_norm": 7.075092315673828, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0287, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.6148848267988088, |
|
"grad_norm": 7.760746002197266, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0866, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.6158264574677197, |
|
"grad_norm": 7.239394664764404, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9918, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.6167680881366306, |
|
"grad_norm": 7.3641276359558105, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9678, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.6177097188055415, |
|
"grad_norm": 10.26201057434082, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1028, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.6186513494744523, |
|
"grad_norm": 6.868218421936035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0633, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.6195929801433633, |
|
"grad_norm": 6.771929740905762, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0126, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.6205346108122741, |
|
"grad_norm": 7.173276901245117, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0549, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.6214762414811851, |
|
"grad_norm": 6.571986198425293, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0469, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6224178721500959, |
|
"grad_norm": 8.061853408813477, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0797, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.6233595028190068, |
|
"grad_norm": 6.856041431427002, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0835, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.6243011334879177, |
|
"grad_norm": 6.356860637664795, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0321, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.6252427641568286, |
|
"grad_norm": 7.281395435333252, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9796, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.6261843948257395, |
|
"grad_norm": 7.561255931854248, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0791, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.6271260254946504, |
|
"grad_norm": 5.9859185218811035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0435, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.6280676561635612, |
|
"grad_norm": 7.344315528869629, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0616, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.6290092868324721, |
|
"grad_norm": 7.146785736083984, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0322, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.629950917501383, |
|
"grad_norm": 7.084141731262207, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0367, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.6308925481702939, |
|
"grad_norm": 7.505505084991455, |
|
"learning_rate": 1e-05, |
|
"loss": 3.052, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6318341788392048, |
|
"grad_norm": 7.202043533325195, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0902, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.6327758095081156, |
|
"grad_norm": 6.772159099578857, |
|
"learning_rate": 1e-05, |
|
"loss": 3.015, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.6337174401770266, |
|
"grad_norm": 6.495903968811035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0254, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.6346590708459374, |
|
"grad_norm": 7.2963457107543945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0161, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.6356007015148484, |
|
"grad_norm": 7.240870475769043, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0329, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.6365423321837592, |
|
"grad_norm": 6.670111656188965, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9956, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.6374839628526701, |
|
"grad_norm": 7.213159561157227, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9224, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.638425593521581, |
|
"grad_norm": 7.563082218170166, |
|
"learning_rate": 1e-05, |
|
"loss": 3.14, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.6393672241904919, |
|
"grad_norm": 7.692990779876709, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9937, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.6403088548594028, |
|
"grad_norm": 7.73374605178833, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0108, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6412504855283137, |
|
"grad_norm": 6.7092976570129395, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0887, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.6421921161972245, |
|
"grad_norm": 7.405087471008301, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9877, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.6431337468661354, |
|
"grad_norm": 7.017783164978027, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0624, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.6440753775350463, |
|
"grad_norm": 6.923324108123779, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9223, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.6450170082039572, |
|
"grad_norm": 7.183239936828613, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1286, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.6459586388728681, |
|
"grad_norm": 7.303028106689453, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0108, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.6469002695417789, |
|
"grad_norm": 6.967830181121826, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0991, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.6478419002106899, |
|
"grad_norm": 7.331003189086914, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1454, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6487835308796007, |
|
"grad_norm": 7.24896764755249, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9953, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.6497251615485117, |
|
"grad_norm": 8.16205883026123, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9901, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6506667922174225, |
|
"grad_norm": 7.7972412109375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1011, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.6516084228863334, |
|
"grad_norm": 6.8451714515686035, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9883, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.6525500535552443, |
|
"grad_norm": 6.7832465171813965, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9701, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.6534916842241552, |
|
"grad_norm": 7.421736717224121, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0205, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.6544333148930661, |
|
"grad_norm": 7.19790506362915, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9935, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.655374945561977, |
|
"grad_norm": 7.348209857940674, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8969, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6563165762308878, |
|
"grad_norm": 7.604790687561035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0523, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.6572582068997987, |
|
"grad_norm": 6.750668525695801, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0571, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.6581998375687096, |
|
"grad_norm": 6.694459915161133, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0252, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.6591414682376205, |
|
"grad_norm": 7.250037670135498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0203, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6600830989065314, |
|
"grad_norm": 6.481886863708496, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1012, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.6610247295754422, |
|
"grad_norm": 7.44242525100708, |
|
"learning_rate": 1e-05, |
|
"loss": 3.031, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.6619663602443532, |
|
"grad_norm": 6.313452243804932, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0551, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.662907990913264, |
|
"grad_norm": 7.033621788024902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0459, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.663849621582175, |
|
"grad_norm": 6.128735065460205, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9017, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.6647912522510858, |
|
"grad_norm": 6.252425670623779, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0382, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.6657328829199967, |
|
"grad_norm": 6.670257091522217, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1805, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.6666745135889076, |
|
"grad_norm": 6.33909797668457, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0568, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.6676161442578185, |
|
"grad_norm": 7.0296630859375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0804, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.6685577749267294, |
|
"grad_norm": 6.852359294891357, |
|
"learning_rate": 1e-05, |
|
"loss": 3.003, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.6694994055956403, |
|
"grad_norm": 7.319458961486816, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9733, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.6704410362645511, |
|
"grad_norm": 7.213205337524414, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0515, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.671382666933462, |
|
"grad_norm": 6.743834972381592, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8611, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.6723242976023729, |
|
"grad_norm": 7.25566291809082, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0633, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.6732659282712838, |
|
"grad_norm": 6.3295578956604, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0196, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.6742075589401947, |
|
"grad_norm": 7.30181884765625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.052, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.6751491896091055, |
|
"grad_norm": 6.492465496063232, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9969, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.6760908202780165, |
|
"grad_norm": 7.620621204376221, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0297, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.6770324509469273, |
|
"grad_norm": 6.799111843109131, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9475, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.6779740816158383, |
|
"grad_norm": 7.955085277557373, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9165, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6789157122847491, |
|
"grad_norm": 6.649621486663818, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0665, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.67985734295366, |
|
"grad_norm": 7.301611423492432, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0468, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.6807989736225709, |
|
"grad_norm": 6.720338821411133, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9307, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.6817406042914818, |
|
"grad_norm": 6.150376796722412, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0759, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.6826822349603927, |
|
"grad_norm": 6.534663200378418, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0509, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.6836238656293036, |
|
"grad_norm": 6.672440052032471, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9175, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.6845654962982144, |
|
"grad_norm": 6.849086761474609, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1881, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.6855071269671253, |
|
"grad_norm": 7.532481670379639, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0881, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.6864487576360362, |
|
"grad_norm": 7.62110710144043, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0303, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.6873903883049471, |
|
"grad_norm": 6.990483283996582, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1092, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.688332018973858, |
|
"grad_norm": 7.549264907836914, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0515, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.6892736496427688, |
|
"grad_norm": 7.345057010650635, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0485, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.6902152803116798, |
|
"grad_norm": 7.437522888183594, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0337, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.6911569109805906, |
|
"grad_norm": 6.142207145690918, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9561, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.6920985416495016, |
|
"grad_norm": 6.626426696777344, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1157, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.6930401723184124, |
|
"grad_norm": 6.62837553024292, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0824, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.6939818029873233, |
|
"grad_norm": 7.575382232666016, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0039, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.6949234336562342, |
|
"grad_norm": 7.333104133605957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0786, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.695865064325145, |
|
"grad_norm": 7.4377923011779785, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9884, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.696806694994056, |
|
"grad_norm": 6.842844486236572, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9574, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6977483256629669, |
|
"grad_norm": 6.345764636993408, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0264, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.6986899563318777, |
|
"grad_norm": 7.15539026260376, |
|
"learning_rate": 1e-05, |
|
"loss": 3.059, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.6996315870007886, |
|
"grad_norm": 7.269350528717041, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0736, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.7005732176696995, |
|
"grad_norm": 6.788912296295166, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9954, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.7015148483386104, |
|
"grad_norm": 6.4122185707092285, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9234, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.7024564790075213, |
|
"grad_norm": 7.263458251953125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9974, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.7033981096764321, |
|
"grad_norm": 6.88037633895874, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0908, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.7043397403453431, |
|
"grad_norm": 7.234920978546143, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0355, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.7052813710142539, |
|
"grad_norm": 7.346100807189941, |
|
"learning_rate": 1e-05, |
|
"loss": 2.938, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.7062230016831649, |
|
"grad_norm": 6.825830936431885, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1014, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.7071646323520757, |
|
"grad_norm": 7.167881965637207, |
|
"learning_rate": 1e-05, |
|
"loss": 3.064, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.7081062630209866, |
|
"grad_norm": 6.6397294998168945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0352, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.7090478936898975, |
|
"grad_norm": 7.553011894226074, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0674, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.7099895243588084, |
|
"grad_norm": 7.612958908081055, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0081, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.7109311550277193, |
|
"grad_norm": 6.634482383728027, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0408, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.7118727856966302, |
|
"grad_norm": 6.5582356452941895, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0377, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.712814416365541, |
|
"grad_norm": 6.804215431213379, |
|
"learning_rate": 1e-05, |
|
"loss": 3.001, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.713756047034452, |
|
"grad_norm": 7.458028316497803, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9582, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.7146976777033628, |
|
"grad_norm": 6.639705181121826, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0887, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.7156393083722736, |
|
"grad_norm": 6.617265701293945, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0967, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7165809390411846, |
|
"grad_norm": 8.8301362991333, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0354, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.7175225697100954, |
|
"grad_norm": 6.7238264083862305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0144, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.7184642003790064, |
|
"grad_norm": 6.8456926345825195, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9888, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.7194058310479172, |
|
"grad_norm": 8.741981506347656, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1182, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.7203474617168281, |
|
"grad_norm": 6.671243667602539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0422, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.721289092385739, |
|
"grad_norm": 7.100765228271484, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0205, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.7222307230546499, |
|
"grad_norm": 7.116576671600342, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0333, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.7231723537235608, |
|
"grad_norm": 6.72273588180542, |
|
"learning_rate": 1e-05, |
|
"loss": 3.081, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.7241139843924717, |
|
"grad_norm": 6.83878231048584, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9635, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.7250556150613825, |
|
"grad_norm": 7.474968433380127, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0792, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.7259972457302935, |
|
"grad_norm": 7.929852485656738, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9793, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.7269388763992043, |
|
"grad_norm": 6.638747215270996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0311, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.7278805070681152, |
|
"grad_norm": 6.270215034484863, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9846, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.7288221377370261, |
|
"grad_norm": 7.199500560760498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0651, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.7297637684059369, |
|
"grad_norm": 6.566064834594727, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9806, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.7307053990748479, |
|
"grad_norm": 7.1128830909729, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0375, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.7316470297437587, |
|
"grad_norm": 6.685820579528809, |
|
"learning_rate": 1e-05, |
|
"loss": 3.058, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.7325886604126697, |
|
"grad_norm": 6.155799388885498, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0514, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.7335302910815805, |
|
"grad_norm": 6.671477794647217, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9818, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.7344719217504914, |
|
"grad_norm": 7.42489767074585, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1053, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7354135524194023, |
|
"grad_norm": 6.796725749969482, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9821, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.7363551830883132, |
|
"grad_norm": 7.848191261291504, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9765, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.7372968137572241, |
|
"grad_norm": 6.922936916351318, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1376, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.738238444426135, |
|
"grad_norm": 6.367747783660889, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1063, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.7391800750950458, |
|
"grad_norm": 6.697858810424805, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9815, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.7401217057639567, |
|
"grad_norm": 6.511905193328857, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9989, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.7410633364328676, |
|
"grad_norm": 7.045902729034424, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0436, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.7420049671017785, |
|
"grad_norm": 6.697847843170166, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9933, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.7429465977706894, |
|
"grad_norm": 8.465171813964844, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1475, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.7438882284396002, |
|
"grad_norm": 7.368391990661621, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9639, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7448298591085112, |
|
"grad_norm": 9.309432983398438, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0413, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.745771489777422, |
|
"grad_norm": 7.960738658905029, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9888, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.746713120446333, |
|
"grad_norm": 7.475559711456299, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0662, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.7476547511152438, |
|
"grad_norm": 7.966568946838379, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0452, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.7485963817841547, |
|
"grad_norm": 7.25874137878418, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9464, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.7495380124530656, |
|
"grad_norm": 7.146042346954346, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0599, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.7504796431219765, |
|
"grad_norm": 7.571315765380859, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0561, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.7514212737908874, |
|
"grad_norm": 7.106105327606201, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0022, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.7523629044597983, |
|
"grad_norm": 7.547785758972168, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1197, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.7533045351287091, |
|
"grad_norm": 6.921874046325684, |
|
"learning_rate": 1e-05, |
|
"loss": 2.946, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7533045351287091, |
|
"eval_accuracy": 0.4179943871149811, |
|
"eval_loss": 3.0350847244262695, |
|
"eval_runtime": 1060.3761, |
|
"eval_samples_per_second": 35.61, |
|
"eval_steps_per_second": 8.903, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.75424616579762, |
|
"grad_norm": 7.115399360656738, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9728, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.7551877964665309, |
|
"grad_norm": 6.1880269050598145, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0188, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.7561294271354418, |
|
"grad_norm": 6.350123405456543, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9826, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.7570710578043527, |
|
"grad_norm": 6.536036968231201, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0136, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.7580126884732635, |
|
"grad_norm": 6.593567848205566, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9847, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.7589543191421745, |
|
"grad_norm": 6.684474468231201, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1012, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.7598959498110853, |
|
"grad_norm": 6.77077579498291, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0014, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.7608375804799963, |
|
"grad_norm": 7.389578819274902, |
|
"learning_rate": 1e-05, |
|
"loss": 3.014, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7617792111489071, |
|
"grad_norm": 6.702242374420166, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9964, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.762720841817818, |
|
"grad_norm": 7.013911247253418, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0237, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7636624724867289, |
|
"grad_norm": 6.621683597564697, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9814, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.7646041031556398, |
|
"grad_norm": 6.537116050720215, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0512, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.7655457338245507, |
|
"grad_norm": 7.781442165374756, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0855, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.7664873644934616, |
|
"grad_norm": 7.431787967681885, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9926, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.7674289951623724, |
|
"grad_norm": 7.550678730010986, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9967, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.7683706258312833, |
|
"grad_norm": 7.120694637298584, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0271, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.7693122565001942, |
|
"grad_norm": 6.2324748039245605, |
|
"learning_rate": 1e-05, |
|
"loss": 2.979, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.7702538871691051, |
|
"grad_norm": 7.045401573181152, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0638, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.771195517838016, |
|
"grad_norm": 7.148534774780273, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9982, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.7721371485069268, |
|
"grad_norm": 6.389626979827881, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9228, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7730787791758378, |
|
"grad_norm": 7.732367992401123, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0264, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.7740204098447486, |
|
"grad_norm": 6.456219673156738, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1302, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.7749620405136596, |
|
"grad_norm": 6.384737014770508, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9592, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.7759036711825704, |
|
"grad_norm": 7.74468469619751, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0449, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.7768453018514813, |
|
"grad_norm": 6.250748634338379, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0414, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.7777869325203922, |
|
"grad_norm": 6.97851037979126, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0961, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.7787285631893031, |
|
"grad_norm": 7.248284816741943, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9958, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.779670193858214, |
|
"grad_norm": 6.732598304748535, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9861, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.7806118245271249, |
|
"grad_norm": 6.4773173332214355, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9421, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.7815534551960357, |
|
"grad_norm": 6.1240620613098145, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9885, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.7824950858649466, |
|
"grad_norm": 8.18076229095459, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8914, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.7834367165338575, |
|
"grad_norm": 6.479029178619385, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9848, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.7843783472027684, |
|
"grad_norm": 7.001804828643799, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9447, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.7853199778716793, |
|
"grad_norm": 7.894457817077637, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9969, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.7862616085405901, |
|
"grad_norm": 6.224025726318359, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1038, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.7872032392095011, |
|
"grad_norm": 7.672967910766602, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1315, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.7881448698784119, |
|
"grad_norm": 6.595861434936523, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0766, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.7890865005473229, |
|
"grad_norm": 6.691225051879883, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0528, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.7900281312162337, |
|
"grad_norm": 6.80654764175415, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0533, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.7909697618851446, |
|
"grad_norm": 7.244470119476318, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0414, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7919113925540555, |
|
"grad_norm": 6.781643390655518, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0211, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.7928530232229664, |
|
"grad_norm": 6.594189167022705, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0595, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.7937946538918773, |
|
"grad_norm": 6.970313549041748, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0505, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.7947362845607882, |
|
"grad_norm": 7.164793491363525, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9477, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.795677915229699, |
|
"grad_norm": 6.909482955932617, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9963, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.79661954589861, |
|
"grad_norm": 6.645529747009277, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0773, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.7975611765675208, |
|
"grad_norm": 6.645266532897949, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0217, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.7985028072364317, |
|
"grad_norm": 6.193962574005127, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9698, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.7994444379053426, |
|
"grad_norm": 6.405771255493164, |
|
"learning_rate": 1e-05, |
|
"loss": 3.012, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.8003860685742534, |
|
"grad_norm": 6.351288795471191, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9623, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.8013276992431644, |
|
"grad_norm": 6.759338855743408, |
|
"learning_rate": 1e-05, |
|
"loss": 3.004, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.8022693299120752, |
|
"grad_norm": 6.525925636291504, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0707, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.8032109605809862, |
|
"grad_norm": 7.245208263397217, |
|
"learning_rate": 1e-05, |
|
"loss": 3.033, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.804152591249897, |
|
"grad_norm": 7.056334972381592, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9285, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.8050942219188079, |
|
"grad_norm": 7.097318172454834, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9806, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.8060358525877188, |
|
"grad_norm": 6.930456638336182, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0609, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.8069774832566297, |
|
"grad_norm": 6.811648368835449, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1415, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.8079191139255405, |
|
"grad_norm": 6.777252197265625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0246, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.8088607445944515, |
|
"grad_norm": 6.139156818389893, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9366, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.8098023752633623, |
|
"grad_norm": 6.63328742980957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0745, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8107440059322732, |
|
"grad_norm": 7.7045183181762695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0592, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.8116856366011841, |
|
"grad_norm": 6.513607978820801, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9483, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.8126272672700949, |
|
"grad_norm": 6.670443534851074, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0345, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.8135688979390059, |
|
"grad_norm": 6.660811901092529, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9852, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.8145105286079167, |
|
"grad_norm": 7.2338361740112305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0945, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.8154521592768277, |
|
"grad_norm": 6.750393390655518, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9639, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.8163937899457385, |
|
"grad_norm": 7.605077743530273, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0707, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.8173354206146494, |
|
"grad_norm": 7.224457263946533, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0978, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.8182770512835603, |
|
"grad_norm": 6.255330562591553, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0075, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.8192186819524712, |
|
"grad_norm": 6.431050777435303, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1404, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.8201603126213821, |
|
"grad_norm": 7.618997097015381, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8578, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.821101943290293, |
|
"grad_norm": 6.874431133270264, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9544, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.8220435739592038, |
|
"grad_norm": 6.51139497756958, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0135, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.8229852046281148, |
|
"grad_norm": 6.711349964141846, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0015, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.8239268352970256, |
|
"grad_norm": 7.047435283660889, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0321, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.8248684659659365, |
|
"grad_norm": 7.979480266571045, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1258, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.8258100966348474, |
|
"grad_norm": 6.74650239944458, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1225, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.8267517273037582, |
|
"grad_norm": 5.72318696975708, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9543, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.8276933579726692, |
|
"grad_norm": 6.581451416015625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9904, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.82863498864158, |
|
"grad_norm": 6.8295769691467285, |
|
"learning_rate": 1e-05, |
|
"loss": 2.994, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.829576619310491, |
|
"grad_norm": 7.150579929351807, |
|
"learning_rate": 1e-05, |
|
"loss": 3.024, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 0.8305182499794018, |
|
"grad_norm": 6.949995994567871, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0905, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.8314598806483127, |
|
"grad_norm": 7.133937835693359, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0603, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 0.8324015113172236, |
|
"grad_norm": 6.495321750640869, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0027, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.8333431419861345, |
|
"grad_norm": 6.963568210601807, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9662, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.8342847726550454, |
|
"grad_norm": 7.362428188323975, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9869, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.8352264033239563, |
|
"grad_norm": 6.788746356964111, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0145, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 0.8361680339928671, |
|
"grad_norm": 7.730389595031738, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1651, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.837109664661778, |
|
"grad_norm": 7.5514116287231445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0095, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.8380512953306889, |
|
"grad_norm": 6.9045257568359375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0569, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8389929259995998, |
|
"grad_norm": 7.573888778686523, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9869, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.8399345566685107, |
|
"grad_norm": 7.188941478729248, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0701, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.8408761873374215, |
|
"grad_norm": 6.4194817543029785, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 0.8418178180063325, |
|
"grad_norm": 6.900866508483887, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9665, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.8427594486752433, |
|
"grad_norm": 7.416345596313477, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0122, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.8437010793441543, |
|
"grad_norm": 7.08629846572876, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1027, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.8446427100130651, |
|
"grad_norm": 6.6842803955078125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9872, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.845584340681976, |
|
"grad_norm": 7.190927982330322, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9336, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.8465259713508869, |
|
"grad_norm": 7.118557929992676, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9181, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 0.8474676020197978, |
|
"grad_norm": 6.6827006340026855, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0798, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8484092326887087, |
|
"grad_norm": 6.979405403137207, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9763, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 0.8493508633576196, |
|
"grad_norm": 6.2492780685424805, |
|
"learning_rate": 1e-05, |
|
"loss": 3.021, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.8502924940265304, |
|
"grad_norm": 7.063942909240723, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9505, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.8512341246954414, |
|
"grad_norm": 6.5366716384887695, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8787, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8521757553643522, |
|
"grad_norm": 6.5581769943237305, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1491, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.8531173860332631, |
|
"grad_norm": 6.441840171813965, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0369, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.854059016702174, |
|
"grad_norm": 7.320792198181152, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0956, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 0.8550006473710848, |
|
"grad_norm": 7.050429821014404, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0892, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.8559422780399958, |
|
"grad_norm": 6.61350679397583, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9785, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.8568839087089066, |
|
"grad_norm": 6.520270347595215, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0025, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8578255393778176, |
|
"grad_norm": 6.5053791999816895, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8551, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 0.8587671700467284, |
|
"grad_norm": 6.498419761657715, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9634, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.8597088007156393, |
|
"grad_norm": 7.01683235168457, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1556, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 0.8606504313845502, |
|
"grad_norm": 6.802071571350098, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0464, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.8615920620534611, |
|
"grad_norm": 6.664157390594482, |
|
"learning_rate": 1e-05, |
|
"loss": 3.012, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.862533692722372, |
|
"grad_norm": 6.3640456199646, |
|
"learning_rate": 1e-05, |
|
"loss": 2.934, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.8634753233912829, |
|
"grad_norm": 6.966064929962158, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9922, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.8644169540601937, |
|
"grad_norm": 6.394281387329102, |
|
"learning_rate": 1e-05, |
|
"loss": 2.904, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.8653585847291047, |
|
"grad_norm": 8.291240692138672, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0824, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 0.8663002153980155, |
|
"grad_norm": 6.770839214324951, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0608, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8672418460669264, |
|
"grad_norm": 7.17354679107666, |
|
"learning_rate": 1e-05, |
|
"loss": 3.007, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.8681834767358373, |
|
"grad_norm": 6.194829940795898, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9562, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.8691251074047481, |
|
"grad_norm": 6.281735420227051, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9716, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 0.8700667380736591, |
|
"grad_norm": 7.197927951812744, |
|
"learning_rate": 1e-05, |
|
"loss": 3.008, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.8710083687425699, |
|
"grad_norm": 6.667356967926025, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1845, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.8719499994114809, |
|
"grad_norm": 7.937153339385986, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0967, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.8728916300803917, |
|
"grad_norm": 6.7486252784729, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0172, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.8738332607493026, |
|
"grad_norm": 7.023075103759766, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0151, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.8747748914182135, |
|
"grad_norm": 6.361885070800781, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0446, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 0.8757165220871244, |
|
"grad_norm": 6.498104572296143, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0111, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.8766581527560353, |
|
"grad_norm": 6.258391380310059, |
|
"learning_rate": 1e-05, |
|
"loss": 3.006, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.8775997834249462, |
|
"grad_norm": 7.305341720581055, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9864, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.878541414093857, |
|
"grad_norm": 6.587819576263428, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9275, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.879483044762768, |
|
"grad_norm": 6.967113018035889, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0478, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.8804246754316788, |
|
"grad_norm": 6.257725715637207, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9357, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.8813663061005897, |
|
"grad_norm": 7.08759069442749, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9292, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.8823079367695006, |
|
"grad_norm": 6.993216514587402, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9556, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 0.8832495674384114, |
|
"grad_norm": 6.2222771644592285, |
|
"learning_rate": 1e-05, |
|
"loss": 2.95, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.8841911981073224, |
|
"grad_norm": 5.418234825134277, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9276, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.8851328287762332, |
|
"grad_norm": 7.821156024932861, |
|
"learning_rate": 1e-05, |
|
"loss": 2.957, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8860744594451442, |
|
"grad_norm": 7.50763463973999, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0653, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 0.887016090114055, |
|
"grad_norm": 5.917377471923828, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0135, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.8879577207829659, |
|
"grad_norm": 6.086848258972168, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9784, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.8888993514518768, |
|
"grad_norm": 7.318466663360596, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9473, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.8898409821207877, |
|
"grad_norm": 6.821482181549072, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0772, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.8907826127896986, |
|
"grad_norm": 6.979645252227783, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0302, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.8917242434586095, |
|
"grad_norm": 7.538966655731201, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0771, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 0.8926658741275203, |
|
"grad_norm": 6.447031497955322, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0053, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.8936075047964313, |
|
"grad_norm": 6.903289794921875, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0288, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 0.8945491354653421, |
|
"grad_norm": 6.24967098236084, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0103, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.895490766134253, |
|
"grad_norm": 6.205604553222656, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0107, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.8964323968031639, |
|
"grad_norm": 6.532459735870361, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0275, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.8973740274720747, |
|
"grad_norm": 6.63011360168457, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9595, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 0.8983156581409857, |
|
"grad_norm": 6.74818229675293, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0929, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.8992572888098965, |
|
"grad_norm": 6.410697937011719, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1027, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.9001989194788075, |
|
"grad_norm": 6.183323860168457, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9818, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.9011405501477183, |
|
"grad_norm": 6.527472496032715, |
|
"learning_rate": 1e-05, |
|
"loss": 2.996, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.9020821808166292, |
|
"grad_norm": 6.853177547454834, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0178, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.9030238114855401, |
|
"grad_norm": 7.206171035766602, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0277, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.903965442154451, |
|
"grad_norm": 7.369021892547607, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9376, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9049070728233618, |
|
"grad_norm": 7.516411781311035, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0268, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 0.9058487034922728, |
|
"grad_norm": 6.495131015777588, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0405, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.9067903341611836, |
|
"grad_norm": 7.050095558166504, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1039, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.9077319648300946, |
|
"grad_norm": 6.463887691497803, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0336, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.9086735954990054, |
|
"grad_norm": 6.821866512298584, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1019, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.9096152261679162, |
|
"grad_norm": 9.358695983886719, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9516, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.9105568568368272, |
|
"grad_norm": 7.6679863929748535, |
|
"learning_rate": 1e-05, |
|
"loss": 2.979, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 0.911498487505738, |
|
"grad_norm": 6.348465919494629, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0011, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.912440118174649, |
|
"grad_norm": 6.158372402191162, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8976, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.9133817488435598, |
|
"grad_norm": 6.827818870544434, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0961, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9143233795124707, |
|
"grad_norm": 6.508996486663818, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9493, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 0.9152650101813816, |
|
"grad_norm": 6.057750701904297, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9642, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.9162066408502925, |
|
"grad_norm": 6.342018127441406, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9965, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.9171482715192034, |
|
"grad_norm": 7.247106552124023, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0096, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.9180899021881143, |
|
"grad_norm": 7.869892120361328, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0186, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.9190315328570251, |
|
"grad_norm": 6.655325412750244, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9532, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.919973163525936, |
|
"grad_norm": 7.238323211669922, |
|
"learning_rate": 1e-05, |
|
"loss": 2.964, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 0.9209147941948469, |
|
"grad_norm": 6.327097415924072, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9265, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.9218564248637579, |
|
"grad_norm": 6.87644624710083, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9748, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 0.9227980555326687, |
|
"grad_norm": 7.045034408569336, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9084, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9237396862015795, |
|
"grad_norm": 7.708536624908447, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0383, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.9246813168704905, |
|
"grad_norm": 6.771285057067871, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9808, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.9256229475394013, |
|
"grad_norm": 6.201882362365723, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0511, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 0.9265645782083123, |
|
"grad_norm": 6.08769416809082, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0086, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.9275062088772231, |
|
"grad_norm": 6.0023393630981445, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0951, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.928447839546134, |
|
"grad_norm": 6.94964075088501, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0002, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.9293894702150449, |
|
"grad_norm": 6.350240707397461, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9763, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.9303311008839558, |
|
"grad_norm": 7.173480033874512, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9946, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.9312727315528667, |
|
"grad_norm": 6.645884990692139, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0732, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.9322143622217776, |
|
"grad_norm": 6.9946370124816895, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0506, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9331559928906884, |
|
"grad_norm": 6.4223456382751465, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9862, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 0.9340976235595994, |
|
"grad_norm": 6.558778762817383, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9428, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.9350392542285102, |
|
"grad_norm": 7.190298080444336, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0667, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.9359808848974212, |
|
"grad_norm": 6.2120184898376465, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9411, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.936922515566332, |
|
"grad_norm": 7.766112804412842, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0971, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.9378641462352428, |
|
"grad_norm": 7.066897392272949, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1249, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.9388057769041538, |
|
"grad_norm": 6.099008560180664, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0034, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 0.9397474075730646, |
|
"grad_norm": 6.696260452270508, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0705, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.9406890382419756, |
|
"grad_norm": 6.140739440917969, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9683, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.9416306689108864, |
|
"grad_norm": 7.1208600997924805, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9827, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9416306689108864, |
|
"eval_accuracy": 0.4198751663662041, |
|
"eval_loss": 3.008730888366699, |
|
"eval_runtime": 1038.4567, |
|
"eval_samples_per_second": 36.362, |
|
"eval_steps_per_second": 9.09, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9425722995797973, |
|
"grad_norm": 6.645845890045166, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9817, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 0.9435139302487082, |
|
"grad_norm": 6.95515251159668, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0721, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.9444555609176191, |
|
"grad_norm": 6.197011947631836, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0179, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 0.94539719158653, |
|
"grad_norm": 6.877534866333008, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0148, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.9463388222554409, |
|
"grad_norm": 6.991530418395996, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0165, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.9472804529243517, |
|
"grad_norm": 7.398330211639404, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9934, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.9482220835932627, |
|
"grad_norm": 7.08344030380249, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0171, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 0.9491637142621735, |
|
"grad_norm": 6.782553195953369, |
|
"learning_rate": 1e-05, |
|
"loss": 2.932, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9501053449310845, |
|
"grad_norm": 6.938860893249512, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9003, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 0.9510469755999953, |
|
"grad_norm": 6.443044185638428, |
|
"learning_rate": 1e-05, |
|
"loss": 3.066, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9519886062689061, |
|
"grad_norm": 7.133784770965576, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0384, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 0.9529302369378171, |
|
"grad_norm": 7.460980415344238, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1658, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.9538718676067279, |
|
"grad_norm": 6.758791923522949, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0877, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 0.9548134982756389, |
|
"grad_norm": 6.621381759643555, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9102, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.9557551289445497, |
|
"grad_norm": 7.349249362945557, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9883, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.9566967596134606, |
|
"grad_norm": 7.027257919311523, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0714, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9576383902823715, |
|
"grad_norm": 7.285141468048096, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9703, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 0.9585800209512824, |
|
"grad_norm": 6.913536548614502, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0111, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.9595216516201933, |
|
"grad_norm": 6.543398857116699, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0237, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 0.9604632822891042, |
|
"grad_norm": 6.569692611694336, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9803, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.961404912958015, |
|
"grad_norm": 6.915118217468262, |
|
"learning_rate": 1e-05, |
|
"loss": 2.941, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 0.962346543626926, |
|
"grad_norm": 7.568630695343018, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1096, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.9632881742958368, |
|
"grad_norm": 6.865966320037842, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0179, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 0.9642298049647477, |
|
"grad_norm": 6.584255695343018, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1118, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.9651714356336586, |
|
"grad_norm": 6.9417314529418945, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9616, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.9661130663025694, |
|
"grad_norm": 7.483455181121826, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8741, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.9670546969714804, |
|
"grad_norm": 6.249847412109375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0775, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 0.9679963276403912, |
|
"grad_norm": 7.1288018226623535, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0476, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.9689379583093022, |
|
"grad_norm": 6.932041168212891, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0245, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 0.969879588978213, |
|
"grad_norm": 7.062252044677734, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9943, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.9708212196471239, |
|
"grad_norm": 7.1132121086120605, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0385, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 0.9717628503160348, |
|
"grad_norm": 6.818399429321289, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0848, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.9727044809849457, |
|
"grad_norm": 6.7855610847473145, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0152, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 0.9736461116538566, |
|
"grad_norm": 6.324885368347168, |
|
"learning_rate": 1e-05, |
|
"loss": 2.91, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.9745877423227675, |
|
"grad_norm": 6.335012435913086, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0045, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.9755293729916783, |
|
"grad_norm": 6.50994873046875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9847, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.9764710036605893, |
|
"grad_norm": 6.5178680419921875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9757, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 0.9774126343295001, |
|
"grad_norm": 7.039647102355957, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9804, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.978354264998411, |
|
"grad_norm": 7.136976718902588, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9841, |
|
"step": 5195 |
|
}, |
|
{ |
|
"epoch": 0.9792958956673219, |
|
"grad_norm": 6.711559772491455, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1176, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9802375263362327, |
|
"grad_norm": 6.82883882522583, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9664, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 0.9811791570051437, |
|
"grad_norm": 7.072768688201904, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9684, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.9821207876740545, |
|
"grad_norm": 6.364959716796875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8905, |
|
"step": 5215 |
|
}, |
|
{ |
|
"epoch": 0.9830624183429655, |
|
"grad_norm": 6.766102313995361, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0164, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.9840040490118763, |
|
"grad_norm": 8.253533363342285, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9722, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.9849456796807872, |
|
"grad_norm": 7.092304229736328, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9922, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.9858873103496981, |
|
"grad_norm": 6.579424858093262, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9465, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 0.986828941018609, |
|
"grad_norm": 7.015688896179199, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9417, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.9877705716875199, |
|
"grad_norm": 7.938333511352539, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0678, |
|
"step": 5245 |
|
}, |
|
{ |
|
"epoch": 0.9887122023564308, |
|
"grad_norm": 6.95517635345459, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9374, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.9896538330253416, |
|
"grad_norm": 6.90593957901001, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9633, |
|
"step": 5255 |
|
}, |
|
{ |
|
"epoch": 0.9905954636942526, |
|
"grad_norm": 7.352856636047363, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0061, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.9915370943631634, |
|
"grad_norm": 7.105273723602295, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9133, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 0.9924787250320743, |
|
"grad_norm": 6.221724033355713, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9518, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.9934203557009852, |
|
"grad_norm": 6.9957380294799805, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0659, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.994361986369896, |
|
"grad_norm": 6.70432186126709, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0431, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.995303617038807, |
|
"grad_norm": 6.815949440002441, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9397, |
|
"step": 5285 |
|
}, |
|
{ |
|
"epoch": 0.9962452477077178, |
|
"grad_norm": 6.253696918487549, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9773, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.9971868783766287, |
|
"grad_norm": 6.235177040100098, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9691, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 0.9981285090455396, |
|
"grad_norm": 8.551080703735352, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0107, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9990701397144505, |
|
"grad_norm": 6.636978626251221, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9, |
|
"step": 5305 |
|
}, |
|
{ |
|
"epoch": 0.9998234442495793, |
|
"step": 5309, |
|
"total_flos": 7.961680948018545e+18, |
|
"train_loss": 3.093570432952174, |
|
"train_runtime": 64923.0121, |
|
"train_samples_per_second": 5.234, |
|
"train_steps_per_second": 0.082 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 5309, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.961680948018545e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|