|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997380141472361, |
|
"eval_steps": 500, |
|
"global_step": 954, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001047943411055803, |
|
"grad_norm": 0.4001561321232938, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 1.3753, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0052397170552790145, |
|
"grad_norm": 0.4325259158841641, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 1.3728, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.010479434110558029, |
|
"grad_norm": 0.48537394471497125, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 1.3694, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015719151165837046, |
|
"grad_norm": 0.21451255069741118, |
|
"learning_rate": 3.125e-05, |
|
"loss": 1.3356, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.020958868221116058, |
|
"grad_norm": 0.1625576362850241, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.2901, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.026198585276395073, |
|
"grad_norm": 0.1442683256295141, |
|
"learning_rate": 5.208333333333334e-05, |
|
"loss": 1.3057, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03143830233167409, |
|
"grad_norm": 0.13774645654676312, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.3118, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03667801938695311, |
|
"grad_norm": 0.1351879639516721, |
|
"learning_rate": 7.291666666666667e-05, |
|
"loss": 1.2465, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.041917736442232116, |
|
"grad_norm": 0.11321368969702118, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.2302, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04715745349751113, |
|
"grad_norm": 0.09641835898020555, |
|
"learning_rate": 9.375e-05, |
|
"loss": 1.1752, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05239717055279015, |
|
"grad_norm": 0.08525335607581626, |
|
"learning_rate": 0.00010416666666666667, |
|
"loss": 1.1696, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05763688760806916, |
|
"grad_norm": 0.08103124164785507, |
|
"learning_rate": 0.00011458333333333333, |
|
"loss": 1.1899, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06287660466334818, |
|
"grad_norm": 0.06901625153652526, |
|
"learning_rate": 0.000125, |
|
"loss": 1.1664, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06811632171862719, |
|
"grad_norm": 0.0674972608334169, |
|
"learning_rate": 0.0001354166666666667, |
|
"loss": 1.1857, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07335603877390622, |
|
"grad_norm": 0.07061975941315685, |
|
"learning_rate": 0.00014583333333333335, |
|
"loss": 1.1566, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07859575582918522, |
|
"grad_norm": 0.06830732833348191, |
|
"learning_rate": 0.00015625, |
|
"loss": 1.1464, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08383547288446423, |
|
"grad_norm": 0.06929791315843523, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.1572, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08907518993974325, |
|
"grad_norm": 0.07358426687980345, |
|
"learning_rate": 0.00017708333333333335, |
|
"loss": 1.1425, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09431490699502226, |
|
"grad_norm": 0.07082497637162344, |
|
"learning_rate": 0.0001875, |
|
"loss": 1.1416, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09955462405030129, |
|
"grad_norm": 0.09545498739533521, |
|
"learning_rate": 0.0001979166666666667, |
|
"loss": 1.1624, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.1047943411055803, |
|
"grad_norm": 0.08500702062384047, |
|
"learning_rate": 0.00019998927475076107, |
|
"loss": 1.1323, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11003405816085932, |
|
"grad_norm": 0.09301268675355433, |
|
"learning_rate": 0.00019994570736865406, |
|
"loss": 1.0883, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.11527377521613832, |
|
"grad_norm": 0.08575977499880286, |
|
"learning_rate": 0.00019986864211644069, |
|
"loss": 1.1046, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12051349227141735, |
|
"grad_norm": 0.10121427622454875, |
|
"learning_rate": 0.00019975810482336233, |
|
"loss": 1.1154, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.12575320932669637, |
|
"grad_norm": 0.09078809335101837, |
|
"learning_rate": 0.00019961413253717213, |
|
"loss": 1.1002, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13099292638197538, |
|
"grad_norm": 0.0781587950026229, |
|
"learning_rate": 0.00019943677351171775, |
|
"loss": 1.1016, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.13623264343725439, |
|
"grad_norm": 0.08268644073866073, |
|
"learning_rate": 0.00019922608719076873, |
|
"loss": 1.1026, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1414723604925334, |
|
"grad_norm": 0.0985349651519935, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 1.14, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.14671207754781243, |
|
"grad_norm": 0.08307522790061574, |
|
"learning_rate": 0.00019870502626379127, |
|
"loss": 1.1298, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15195179460309144, |
|
"grad_norm": 0.08579822231869356, |
|
"learning_rate": 0.00019839482629689154, |
|
"loss": 1.0967, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.15719151165837045, |
|
"grad_norm": 0.07617635505916012, |
|
"learning_rate": 0.0001980516482542224, |
|
"loss": 1.1107, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16243122871364946, |
|
"grad_norm": 0.07813217848338273, |
|
"learning_rate": 0.00019767560715556597, |
|
"loss": 1.1271, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.16767094576892846, |
|
"grad_norm": 0.08062953312484893, |
|
"learning_rate": 0.0001972668290351084, |
|
"loss": 1.0962, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1729106628242075, |
|
"grad_norm": 0.079570669107266, |
|
"learning_rate": 0.00019682545089919784, |
|
"loss": 1.1172, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.1781503798794865, |
|
"grad_norm": 0.07824690927340332, |
|
"learning_rate": 0.00019635162068042545, |
|
"loss": 1.0977, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18339009693476552, |
|
"grad_norm": 0.07565054237827153, |
|
"learning_rate": 0.0001958454971880441, |
|
"loss": 1.0989, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.18862981399004453, |
|
"grad_norm": 0.08232625474832411, |
|
"learning_rate": 0.00019530725005474195, |
|
"loss": 1.1031, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19386953104532356, |
|
"grad_norm": 0.08313986432990468, |
|
"learning_rate": 0.00019473705967978808, |
|
"loss": 1.1188, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.19910924810060257, |
|
"grad_norm": 0.08608307927164675, |
|
"learning_rate": 0.00019413511716856972, |
|
"loss": 1.1182, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.20434896515588158, |
|
"grad_norm": 0.07591604791733918, |
|
"learning_rate": 0.0001935016242685415, |
|
"loss": 1.1136, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2095886822111606, |
|
"grad_norm": 0.08255303175008252, |
|
"learning_rate": 0.00019283679330160726, |
|
"loss": 1.0869, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21482839926643962, |
|
"grad_norm": 0.08084692432416707, |
|
"learning_rate": 0.00019214084709295848, |
|
"loss": 1.1077, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.22006811632171863, |
|
"grad_norm": 0.07931205352695565, |
|
"learning_rate": 0.0001914140188963917, |
|
"loss": 1.0975, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22530783337699764, |
|
"grad_norm": 0.0771370875424395, |
|
"learning_rate": 0.0001906565523161312, |
|
"loss": 1.1186, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.23054755043227665, |
|
"grad_norm": 0.0881145788751443, |
|
"learning_rate": 0.00018986870122518262, |
|
"loss": 1.1124, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.23578726748755569, |
|
"grad_norm": 0.1055770930021298, |
|
"learning_rate": 0.00018905072968024425, |
|
"loss": 1.0968, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2410269845428347, |
|
"grad_norm": 0.09095639364627423, |
|
"learning_rate": 0.00018820291183320603, |
|
"loss": 1.126, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2462667015981137, |
|
"grad_norm": 0.07225566008587724, |
|
"learning_rate": 0.00018732553183926443, |
|
"loss": 1.0918, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.25150641865339274, |
|
"grad_norm": 0.07767735154463175, |
|
"learning_rate": 0.00018641888376168484, |
|
"loss": 1.1009, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2567461357086717, |
|
"grad_norm": 0.08437806562537126, |
|
"learning_rate": 0.00018548327147324315, |
|
"loss": 1.0768, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.26198585276395076, |
|
"grad_norm": 0.07576721126196213, |
|
"learning_rate": 0.0001845190085543795, |
|
"loss": 1.0878, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26722556981922974, |
|
"grad_norm": 0.08205260181446893, |
|
"learning_rate": 0.00018352641818809848, |
|
"loss": 1.1281, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.27246528687450877, |
|
"grad_norm": 0.07975349596755926, |
|
"learning_rate": 0.00018250583305165098, |
|
"loss": 1.1293, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2777050039297878, |
|
"grad_norm": 0.07530550662057361, |
|
"learning_rate": 0.00018145759520503358, |
|
"loss": 1.1147, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.2829447209850668, |
|
"grad_norm": 0.0807310288155147, |
|
"learning_rate": 0.00018038205597634393, |
|
"loss": 1.1092, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2881844380403458, |
|
"grad_norm": 0.0783642611125798, |
|
"learning_rate": 0.00017927957584402897, |
|
"loss": 1.0969, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.29342415509562486, |
|
"grad_norm": 0.0834215103482037, |
|
"learning_rate": 0.000178150524316067, |
|
"loss": 1.0951, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.29866387215090384, |
|
"grad_norm": 0.0777991252254151, |
|
"learning_rate": 0.00017699527980612304, |
|
"loss": 1.0974, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3039035892061829, |
|
"grad_norm": 0.07401384328604721, |
|
"learning_rate": 0.00017581422950671942, |
|
"loss": 1.1258, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.30914330626146186, |
|
"grad_norm": 0.0768067427086218, |
|
"learning_rate": 0.00017460776925946417, |
|
"loss": 1.0946, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.3143830233167409, |
|
"grad_norm": 0.07212540160444633, |
|
"learning_rate": 0.00017337630342238042, |
|
"loss": 1.1074, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.31962274037201993, |
|
"grad_norm": 0.0748307586015756, |
|
"learning_rate": 0.00017212024473438147, |
|
"loss": 1.0954, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3248624574272989, |
|
"grad_norm": 0.07601887269890263, |
|
"learning_rate": 0.00017084001417693703, |
|
"loss": 1.0936, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.33010217448257795, |
|
"grad_norm": 0.07281811010206982, |
|
"learning_rate": 0.00016953604083297665, |
|
"loss": 1.1219, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.33534189153785693, |
|
"grad_norm": 0.07523858579513697, |
|
"learning_rate": 0.00016820876174307821, |
|
"loss": 1.1157, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34058160859313596, |
|
"grad_norm": 0.08575069343747709, |
|
"learning_rate": 0.00016685862175898892, |
|
"loss": 1.1179, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.345821325648415, |
|
"grad_norm": 0.07857161375376395, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 1.0844, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.351061042703694, |
|
"grad_norm": 0.07945712197579065, |
|
"learning_rate": 0.00016409157667392457, |
|
"loss": 1.0964, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.356300759758973, |
|
"grad_norm": 0.08254440545274826, |
|
"learning_rate": 0.00016267559897763028, |
|
"loss": 1.094, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.36154047681425205, |
|
"grad_norm": 0.07939321839093792, |
|
"learning_rate": 0.0001612386148856771, |
|
"loss": 1.0966, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.36678019386953103, |
|
"grad_norm": 0.08193880396697184, |
|
"learning_rate": 0.0001597811060186141, |
|
"loss": 1.0997, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37201991092481007, |
|
"grad_norm": 0.07719408936904257, |
|
"learning_rate": 0.00015830356087608764, |
|
"loss": 1.0789, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.37725962798008905, |
|
"grad_norm": 0.07549679547307497, |
|
"learning_rate": 0.00015680647467311557, |
|
"loss": 1.1008, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3824993450353681, |
|
"grad_norm": 0.08172686400286096, |
|
"learning_rate": 0.00015529034917411073, |
|
"loss": 1.1251, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3877390620906471, |
|
"grad_norm": 0.07573300240197864, |
|
"learning_rate": 0.00015375569252470896, |
|
"loss": 1.1121, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3929787791459261, |
|
"grad_norm": 0.07510493645601654, |
|
"learning_rate": 0.00015220301908145905, |
|
"loss": 1.0617, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.39821849620120514, |
|
"grad_norm": 0.07047562795180644, |
|
"learning_rate": 0.00015063284923943031, |
|
"loss": 1.0964, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4034582132564842, |
|
"grad_norm": 0.07710971072693787, |
|
"learning_rate": 0.00014904570925779683, |
|
"loss": 1.0822, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.40869793031176316, |
|
"grad_norm": 0.08283963772289088, |
|
"learning_rate": 0.00014744213108345604, |
|
"loss": 1.0744, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4139376473670422, |
|
"grad_norm": 0.07086438189754951, |
|
"learning_rate": 0.00014582265217274104, |
|
"loss": 1.109, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4191773644223212, |
|
"grad_norm": 0.06876096472263239, |
|
"learning_rate": 0.00014418781531128636, |
|
"loss": 1.1008, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4244170814776002, |
|
"grad_norm": 0.07686166297785638, |
|
"learning_rate": 0.0001425381684321075, |
|
"loss": 1.0829, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.42965679853287925, |
|
"grad_norm": 0.08368209207409245, |
|
"learning_rate": 0.00014087426443195548, |
|
"loss": 1.1336, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43489651558815823, |
|
"grad_norm": 0.0725885844341327, |
|
"learning_rate": 0.00013919666098600753, |
|
"loss": 1.0874, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.44013623264343726, |
|
"grad_norm": 0.07310731092919365, |
|
"learning_rate": 0.0001375059203609562, |
|
"loss": 1.084, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.44537594969871624, |
|
"grad_norm": 0.07788032512060031, |
|
"learning_rate": 0.00013580260922655985, |
|
"loss": 1.102, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4506156667539953, |
|
"grad_norm": 0.07182054106731274, |
|
"learning_rate": 0.00013408729846571714, |
|
"loss": 1.075, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4558553838092743, |
|
"grad_norm": 0.0745557942612687, |
|
"learning_rate": 0.00013236056298312958, |
|
"loss": 1.0936, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4610951008645533, |
|
"grad_norm": 0.07109140712607685, |
|
"learning_rate": 0.00013062298151261592, |
|
"loss": 1.0762, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.46633481791983233, |
|
"grad_norm": 0.07067264389645633, |
|
"learning_rate": 0.00012887513642314373, |
|
"loss": 1.1132, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.47157453497511137, |
|
"grad_norm": 0.08171597552387606, |
|
"learning_rate": 0.00012711761352364172, |
|
"loss": 1.0829, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.47681425203039035, |
|
"grad_norm": 0.07525113073399527, |
|
"learning_rate": 0.00012535100186666, |
|
"loss": 1.0983, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4820539690856694, |
|
"grad_norm": 0.07155593708972481, |
|
"learning_rate": 0.00012357589355094275, |
|
"loss": 1.0934, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.48729368614094837, |
|
"grad_norm": 0.07568360243136522, |
|
"learning_rate": 0.00012179288352297984, |
|
"loss": 1.1088, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4925334031962274, |
|
"grad_norm": 0.07870992806459551, |
|
"learning_rate": 0.00012000256937760445, |
|
"loss": 1.0886, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.49777312025150644, |
|
"grad_norm": 0.07740807887909781, |
|
"learning_rate": 0.00011820555115770255, |
|
"loss": 1.0769, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5030128373067855, |
|
"grad_norm": 0.08195630153403244, |
|
"learning_rate": 0.00011640243115310218, |
|
"loss": 1.0676, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5082525543620644, |
|
"grad_norm": 0.07035497008178994, |
|
"learning_rate": 0.00011459381369870974, |
|
"loss": 1.0847, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.5134922714173434, |
|
"grad_norm": 0.07213741826205265, |
|
"learning_rate": 0.00011278030497196049, |
|
"loss": 1.0882, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5187319884726225, |
|
"grad_norm": 0.07436809187729645, |
|
"learning_rate": 0.00011096251278965172, |
|
"loss": 1.0897, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5239717055279015, |
|
"grad_norm": 0.08114263617937822, |
|
"learning_rate": 0.00010914104640422679, |
|
"loss": 1.091, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5292114225831805, |
|
"grad_norm": 0.07778710874522761, |
|
"learning_rate": 0.00010731651629957722, |
|
"loss": 1.0714, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5344511396384595, |
|
"grad_norm": 0.08462996195681859, |
|
"learning_rate": 0.00010548953398643275, |
|
"loss": 1.0774, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5396908566937385, |
|
"grad_norm": 0.07951504140245977, |
|
"learning_rate": 0.00010366071179740706, |
|
"loss": 1.0849, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5449305737490175, |
|
"grad_norm": 0.07543823682325243, |
|
"learning_rate": 0.00010183066268176776, |
|
"loss": 1.0965, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5501702908042966, |
|
"grad_norm": 0.07866000568639331, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0606, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5554100078595756, |
|
"grad_norm": 0.0734883472447597, |
|
"learning_rate": 9.816933731823231e-05, |
|
"loss": 1.1037, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5606497249148547, |
|
"grad_norm": 0.06987918160697645, |
|
"learning_rate": 9.633928820259295e-05, |
|
"loss": 1.0805, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5658894419701336, |
|
"grad_norm": 0.07153298757189964, |
|
"learning_rate": 9.451046601356725e-05, |
|
"loss": 1.1353, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5711291590254126, |
|
"grad_norm": 0.0744809511267128, |
|
"learning_rate": 9.268348370042281e-05, |
|
"loss": 1.0661, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5763688760806917, |
|
"grad_norm": 0.07467776042097098, |
|
"learning_rate": 9.085895359577324e-05, |
|
"loss": 1.0986, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5816085931359707, |
|
"grad_norm": 0.07805443340486796, |
|
"learning_rate": 8.903748721034827e-05, |
|
"loss": 1.1047, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5868483101912497, |
|
"grad_norm": 0.07920975635426553, |
|
"learning_rate": 8.721969502803954e-05, |
|
"loss": 1.0786, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5920880272465286, |
|
"grad_norm": 0.07525491817078783, |
|
"learning_rate": 8.540618630129029e-05, |
|
"loss": 1.074, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5973277443018077, |
|
"grad_norm": 0.07250304608041182, |
|
"learning_rate": 8.359756884689784e-05, |
|
"loss": 1.0878, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6025674613570867, |
|
"grad_norm": 0.0733085547464114, |
|
"learning_rate": 8.179444884229746e-05, |
|
"loss": 1.0903, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6078071784123658, |
|
"grad_norm": 0.07942414834860045, |
|
"learning_rate": 7.999743062239557e-05, |
|
"loss": 1.0709, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6130468954676448, |
|
"grad_norm": 0.07667678452382841, |
|
"learning_rate": 7.820711647702017e-05, |
|
"loss": 1.1036, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6182866125229237, |
|
"grad_norm": 0.0736959002458591, |
|
"learning_rate": 7.642410644905726e-05, |
|
"loss": 1.1069, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6235263295782028, |
|
"grad_norm": 0.07339877720581538, |
|
"learning_rate": 7.464899813334001e-05, |
|
"loss": 1.0663, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6287660466334818, |
|
"grad_norm": 0.0755698325420648, |
|
"learning_rate": 7.28823864763583e-05, |
|
"loss": 1.0794, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6340057636887608, |
|
"grad_norm": 0.07192880469266458, |
|
"learning_rate": 7.112486357685631e-05, |
|
"loss": 1.0828, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6392454807440399, |
|
"grad_norm": 0.07254861043098242, |
|
"learning_rate": 6.937701848738406e-05, |
|
"loss": 1.0756, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6444851977993188, |
|
"grad_norm": 0.07557384049733529, |
|
"learning_rate": 6.763943701687045e-05, |
|
"loss": 1.0707, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6497249148545978, |
|
"grad_norm": 0.07452016389965133, |
|
"learning_rate": 6.591270153428288e-05, |
|
"loss": 1.1175, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6549646319098769, |
|
"grad_norm": 0.07996587854125373, |
|
"learning_rate": 6.419739077344016e-05, |
|
"loss": 1.0689, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6602043489651559, |
|
"grad_norm": 0.07413365338024828, |
|
"learning_rate": 6.249407963904382e-05, |
|
"loss": 1.0751, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6654440660204349, |
|
"grad_norm": 0.07459048337829312, |
|
"learning_rate": 6.080333901399251e-05, |
|
"loss": 1.093, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6706837830757139, |
|
"grad_norm": 0.07384856775055335, |
|
"learning_rate": 5.9125735568044524e-05, |
|
"loss": 1.08, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6759235001309929, |
|
"grad_norm": 0.07929304895930443, |
|
"learning_rate": 5.746183156789252e-05, |
|
"loss": 1.0676, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.6811632171862719, |
|
"grad_norm": 0.07350637434626273, |
|
"learning_rate": 5.581218468871366e-05, |
|
"loss": 1.1031, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.686402934241551, |
|
"grad_norm": 0.07540040766752996, |
|
"learning_rate": 5.417734782725896e-05, |
|
"loss": 1.0757, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.69164265129683, |
|
"grad_norm": 0.07568774804377028, |
|
"learning_rate": 5.2557868916543994e-05, |
|
"loss": 1.0876, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.696882368352109, |
|
"grad_norm": 0.07333524606614103, |
|
"learning_rate": 5.0954290742203195e-05, |
|
"loss": 1.0983, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.702122085407388, |
|
"grad_norm": 0.07303211978160114, |
|
"learning_rate": 4.936715076056975e-05, |
|
"loss": 1.0786, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.707361802462667, |
|
"grad_norm": 0.07783295637332018, |
|
"learning_rate": 4.779698091854098e-05, |
|
"loss": 1.0998, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.712601519517946, |
|
"grad_norm": 0.07902938086530782, |
|
"learning_rate": 4.624430747529102e-05, |
|
"loss": 1.0847, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7178412365732251, |
|
"grad_norm": 0.07352763796685166, |
|
"learning_rate": 4.4709650825889283e-05, |
|
"loss": 1.0749, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.7230809536285041, |
|
"grad_norm": 0.07547148249723341, |
|
"learning_rate": 4.3193525326884435e-05, |
|
"loss": 1.0768, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.728320670683783, |
|
"grad_norm": 0.0746832803560011, |
|
"learning_rate": 4.169643912391241e-05, |
|
"loss": 1.1248, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.7335603877390621, |
|
"grad_norm": 0.0716047892433064, |
|
"learning_rate": 4.021889398138593e-05, |
|
"loss": 1.114, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7388001047943411, |
|
"grad_norm": 0.07737137661226196, |
|
"learning_rate": 3.87613851143229e-05, |
|
"loss": 1.0713, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.7440398218496201, |
|
"grad_norm": 0.07385277062243696, |
|
"learning_rate": 3.732440102236975e-05, |
|
"loss": 1.0925, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7492795389048992, |
|
"grad_norm": 0.0721480276098664, |
|
"learning_rate": 3.5908423326075456e-05, |
|
"loss": 1.0754, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.7545192559601781, |
|
"grad_norm": 0.0755228681817193, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 1.1039, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7597589730154571, |
|
"grad_norm": 0.07201430505747895, |
|
"learning_rate": 3.314137824101111e-05, |
|
"loss": 1.0818, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.7649986900707362, |
|
"grad_norm": 0.07923887248519075, |
|
"learning_rate": 3.179123825692178e-05, |
|
"loss": 1.0938, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7702384071260152, |
|
"grad_norm": 0.08502340793939184, |
|
"learning_rate": 3.0463959167023336e-05, |
|
"loss": 1.1037, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.7754781241812942, |
|
"grad_norm": 0.07305649086014973, |
|
"learning_rate": 2.9159985823062997e-05, |
|
"loss": 1.0617, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7807178412365732, |
|
"grad_norm": 0.07363395579085769, |
|
"learning_rate": 2.7879755265618555e-05, |
|
"loss": 1.0816, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.7859575582918522, |
|
"grad_norm": 0.07380591423429197, |
|
"learning_rate": 2.6623696577619627e-05, |
|
"loss": 1.1121, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7911972753471312, |
|
"grad_norm": 0.07553408254615797, |
|
"learning_rate": 2.539223074053585e-05, |
|
"loss": 1.1121, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.7964369924024103, |
|
"grad_norm": 0.07091968630997039, |
|
"learning_rate": 2.418577049328058e-05, |
|
"loss": 1.0779, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8016767094576893, |
|
"grad_norm": 0.07111148174207031, |
|
"learning_rate": 2.3004720193876973e-05, |
|
"loss": 1.0769, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.8069164265129684, |
|
"grad_norm": 0.07286026119404068, |
|
"learning_rate": 2.1849475683932996e-05, |
|
"loss": 1.0852, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8121561435682473, |
|
"grad_norm": 0.07285993869373032, |
|
"learning_rate": 2.0720424155971042e-05, |
|
"loss": 1.0702, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.8173958606235263, |
|
"grad_norm": 0.0768385830643589, |
|
"learning_rate": 1.961794402365611e-05, |
|
"loss": 1.0652, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8226355776788054, |
|
"grad_norm": 0.07700519821749924, |
|
"learning_rate": 1.854240479496643e-05, |
|
"loss": 1.105, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.8278752947340844, |
|
"grad_norm": 0.07789933149114912, |
|
"learning_rate": 1.7494166948349055e-05, |
|
"loss": 1.0992, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8331150117893634, |
|
"grad_norm": 0.0713531816025941, |
|
"learning_rate": 1.647358181190153e-05, |
|
"loss": 1.0672, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.8383547288446423, |
|
"grad_norm": 0.07496958584065704, |
|
"learning_rate": 1.5480991445620542e-05, |
|
"loss": 1.1105, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8435944458999214, |
|
"grad_norm": 0.07272092802927198, |
|
"learning_rate": 1.4516728526756874e-05, |
|
"loss": 1.1139, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.8488341629552004, |
|
"grad_norm": 0.07161568950716177, |
|
"learning_rate": 1.3581116238315195e-05, |
|
"loss": 1.085, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8540738800104795, |
|
"grad_norm": 0.07695666983129193, |
|
"learning_rate": 1.2674468160735587e-05, |
|
"loss": 1.0616, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.8593135970657585, |
|
"grad_norm": 0.07401216452154172, |
|
"learning_rate": 1.1797088166794e-05, |
|
"loss": 1.0734, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8645533141210374, |
|
"grad_norm": 0.07259433622324657, |
|
"learning_rate": 1.0949270319755766e-05, |
|
"loss": 1.0684, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.8697930311763165, |
|
"grad_norm": 0.07530462440367772, |
|
"learning_rate": 1.013129877481741e-05, |
|
"loss": 1.0983, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8750327482315955, |
|
"grad_norm": 0.07005327786295418, |
|
"learning_rate": 9.3434476838688e-06, |
|
"loss": 1.0959, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.8802724652868745, |
|
"grad_norm": 0.07386728875097175, |
|
"learning_rate": 8.585981103608342e-06, |
|
"loss": 1.0778, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8855121823421536, |
|
"grad_norm": 0.07241052446613962, |
|
"learning_rate": 7.859152907041545e-06, |
|
"loss": 1.0972, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.8907518993974325, |
|
"grad_norm": 0.0752050904544279, |
|
"learning_rate": 7.163206698392744e-06, |
|
"loss": 1.0816, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8959916164527115, |
|
"grad_norm": 0.07365477033333792, |
|
"learning_rate": 6.498375731458528e-06, |
|
"loss": 1.0838, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.9012313335079906, |
|
"grad_norm": 0.07349597369238159, |
|
"learning_rate": 5.864882831430274e-06, |
|
"loss": 1.0655, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9064710505632696, |
|
"grad_norm": 0.0744936274605184, |
|
"learning_rate": 5.262940320211951e-06, |
|
"loss": 1.1006, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.9117107676185486, |
|
"grad_norm": 0.07094867075036164, |
|
"learning_rate": 4.692749945258057e-06, |
|
"loss": 1.0821, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9169504846738276, |
|
"grad_norm": 0.07625036960430226, |
|
"learning_rate": 4.154502811955907e-06, |
|
"loss": 1.0944, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.9221902017291066, |
|
"grad_norm": 0.0723629686619488, |
|
"learning_rate": 3.6483793195745684e-06, |
|
"loss": 1.083, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9274299187843856, |
|
"grad_norm": 0.0763650337980307, |
|
"learning_rate": 3.1745491008021598e-06, |
|
"loss": 1.081, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.9326696358396647, |
|
"grad_norm": 0.0728944319249245, |
|
"learning_rate": 2.7331709648916073e-06, |
|
"loss": 1.1094, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9379093528949437, |
|
"grad_norm": 0.07193879908153704, |
|
"learning_rate": 2.3243928444340426e-06, |
|
"loss": 1.0809, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.9431490699502227, |
|
"grad_norm": 0.07421423217890344, |
|
"learning_rate": 1.9483517457776436e-06, |
|
"loss": 1.09, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9483887870055017, |
|
"grad_norm": 0.07663776082455946, |
|
"learning_rate": 1.6051737031084536e-06, |
|
"loss": 1.0814, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.9536285040607807, |
|
"grad_norm": 0.07329159457647504, |
|
"learning_rate": 1.2949737362087156e-06, |
|
"loss": 1.0976, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9588682211160597, |
|
"grad_norm": 0.07520915566066837, |
|
"learning_rate": 1.0178558119067315e-06, |
|
"loss": 1.1012, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.9641079381713388, |
|
"grad_norm": 0.07528058322802142, |
|
"learning_rate": 7.73912809231292e-07, |
|
"loss": 1.095, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9693476552266178, |
|
"grad_norm": 0.10802859507213024, |
|
"learning_rate": 5.632264882822758e-07, |
|
"loss": 1.1008, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.9745873722818967, |
|
"grad_norm": 0.07235805606525178, |
|
"learning_rate": 3.8586746282788244e-07, |
|
"loss": 1.105, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9798270893371758, |
|
"grad_norm": 0.07225406845866389, |
|
"learning_rate": 2.4189517663767424e-07, |
|
"loss": 1.0796, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9850668063924548, |
|
"grad_norm": 0.07529138400314424, |
|
"learning_rate": 1.3135788355934652e-07, |
|
"loss": 1.1061, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9903065234477338, |
|
"grad_norm": 0.0904445943158699, |
|
"learning_rate": 5.4292631345942424e-08, |
|
"loss": 1.0877, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.9955462405030129, |
|
"grad_norm": 0.07392400364849133, |
|
"learning_rate": 1.0725249238940915e-08, |
|
"loss": 1.0649, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9997380141472361, |
|
"eval_loss": 0.9753671884536743, |
|
"eval_runtime": 2.0953, |
|
"eval_samples_per_second": 3.341, |
|
"eval_steps_per_second": 0.955, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.9997380141472361, |
|
"step": 954, |
|
"total_flos": 1.1997126510772224e+16, |
|
"train_loss": 1.1073102571179532, |
|
"train_runtime": 19660.0543, |
|
"train_samples_per_second": 3.106, |
|
"train_steps_per_second": 0.049 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 954, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1997126510772224e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|